tlsfuzzer · tomato42 · Nov 30, 2020 · Nov 21, 2020 · Nov 21, 2020
diff --git a/README.md b/README.md
@@ -72,32 +72,35 @@ pip install ecdsa[gmpy]
 
 The following table shows how long this library takes to generate keypairs
 (`keygen`), to sign data (`sign`), to verify those signatures (`verify`),
-and to derive a shared secret (`ecdh`).
+to derive a shared secret (`ecdh`), and
+to verify the signatures with no key-specific precomputation (`no PC verify`).
 All those values are in seconds.
 For convenience, the inverses of those values are also provided:
 how many keys per second can be generated (`keygen/s`), how many signatures
 can be made per second (`sign/s`), how many signatures can be verified
-per second (`verify/s`), and how many shared secrets can be derived per second
-(`ecdh/s`). The size in bytes of a raw signature (generally the smallest
+per second (`verify/s`), how many shared secrets can be derived per second
+(`ecdh/s`), and how many signatures with no key-specific
+precomputation can be verified per second (`no PC verify/s`). The size in bytes
+of a raw signature (generally the smallest
 way a signature can be encoded) is also provided in the `siglen` column.
 Use `tox -e speed` to generate this table on your own computer.
 On an Intel Core i7 4790K @ 4.0GHz I'm getting the following performance:
 
 ```
-                  siglen    keygen   keygen/s      sign     sign/s    verify   verify/s
-        NIST192p:     48   0.00035s   2893.02   0.00038s   2620.53   0.00069s   1458.92
-        NIST224p:     56   0.00043s   2307.11   0.00048s   2092.00   0.00088s   1131.33
-        NIST256p:     64   0.00056s   1793.70   0.00061s   1639.87   0.00113s    883.79
-        NIST384p:     96   0.00116s    864.33   0.00124s    806.29   0.00233s    429.87
-        NIST521p:    132   0.00221s    452.16   0.00234s    427.31   0.00460s    217.19
-       SECP256k1:     64   0.00056s   1772.65   0.00061s   1628.73   0.00110s    912.13
- BRAINPOOLP160r1:     40   0.00026s   3801.86   0.00029s   3401.11   0.00052s   1930.47
- BRAINPOOLP192r1:     48   0.00034s   2925.73   0.00038s   2634.34   0.00070s   1438.06
- BRAINPOOLP224r1:     56   0.00044s   2287.98   0.00048s   2083.87   0.00088s   1137.52
- BRAINPOOLP256r1:     64   0.00056s   1774.11   0.00061s   1628.25   0.00112s    890.71
- BRAINPOOLP320r1:     80   0.00081s   1238.18   0.00087s   1146.71   0.00151s    661.95
- BRAINPOOLP384r1:     96   0.00117s    855.47   0.00124s    804.56   0.00241s    414.83
- BRAINPOOLP512r1:    128   0.00223s    447.99   0.00234s    427.49   0.00437s    229.09
+                  siglen    keygen   keygen/s      sign     sign/s    verify   verify/s  no PC verify  no PC verify/s
+        NIST192p:     48   0.00033s   2991.13   0.00036s   2740.86   0.00067s   1502.11       0.00136s         737.54
+        NIST224p:     56   0.00042s   2360.67   0.00046s   2190.16   0.00083s   1201.83       0.00170s         587.79
+        NIST256p:     64   0.00053s   1872.02   0.00057s   1743.08   0.00103s    968.53       0.00219s         457.36
+        NIST384p:     96   0.00110s    907.45   0.00116s    861.63   0.00218s    459.38       0.00445s         224.92
+        NIST521p:    132   0.00214s    467.72   0.00223s    448.70   0.00430s    232.76       0.00888s         112.66
+       SECP256k1:     64   0.00054s   1841.11   0.00058s   1722.33   0.00111s    903.07       0.00216s         464.01
+ BRAINPOOLP160r1:     40   0.00026s   3780.81   0.00029s   3422.67   0.00054s   1863.09       0.00109s         914.93
+ BRAINPOOLP192r1:     48   0.00034s   2942.79   0.00037s   2710.56   0.00070s   1435.59       0.00138s         724.79
+ BRAINPOOLP224r1:     56   0.00044s   2278.35   0.00047s   2145.32   0.00090s   1115.34       0.00182s         549.72
+ BRAINPOOLP256r1:     64   0.00055s   1832.95   0.00059s   1704.50   0.00110s    911.02       0.00234s         427.22
+ BRAINPOOLP320r1:     80   0.00077s   1305.78   0.00082s   1222.47   0.00156s    640.27       0.00321s         311.56
+ BRAINPOOLP384r1:     96   0.00112s    893.07   0.00118s    849.32   0.00228s    438.75       0.00478s         209.35
+ BRAINPOOLP512r1:    128   0.00213s    470.08   0.00221s    451.98   0.00419s    238.70       0.00940s         106.44
 
                        ecdh     ecdh/s
         NIST192p:   0.00110s    910.70
@@ -118,20 +121,20 @@ On an Intel Core i7 4790K @ 4.0GHz I'm getting the following performance:
 To test performance with `gmpy2` loaded, use `tox -e speedgmpy2`.
 On the same machine I'm getting the following performance with `gmpy2`:
 ```
-                  siglen    keygen   keygen/s      sign     sign/s    verify   verify/s
-        NIST192p:     48   0.00017s   5945.50   0.00018s   5544.66   0.00033s   3002.54
-        NIST224p:     56   0.00021s   4742.14   0.00022s   4463.52   0.00044s   2248.59
-        NIST256p:     64   0.00024s   4155.73   0.00025s   3994.28   0.00047s   2105.34
-        NIST384p:     96   0.00041s   2415.06   0.00043s   2316.41   0.00085s   1177.18
-        NIST521p:    132   0.00072s   1391.14   0.00074s   1359.63   0.00140s    716.31
-       SECP256k1:     64   0.00024s   4216.50   0.00025s   3994.52   0.00047s   2120.57
- BRAINPOOLP160r1:     40   0.00014s   7038.99   0.00015s   6501.55   0.00029s   3397.79
- BRAINPOOLP192r1:     48   0.00017s   5983.18   0.00018s   5626.08   0.00035s   2843.62
- BRAINPOOLP224r1:     56   0.00021s   4727.54   0.00022s   4464.86   0.00043s   2326.84
- BRAINPOOLP256r1:     64   0.00024s   4221.00   0.00025s   4010.26   0.00049s   2046.40
- BRAINPOOLP320r1:     80   0.00032s   3142.14   0.00033s   3009.15   0.00061s   1652.88
- BRAINPOOLP384r1:     96   0.00041s   2415.98   0.00043s   2340.35   0.00083s   1198.77
- BRAINPOOLP512r1:    128   0.00064s   1567.27   0.00066s   1526.33   0.00127s    788.51
+                  siglen    keygen   keygen/s      sign     sign/s    verify   verify/s  no PC verify  no PC verify/s
+        NIST192p:     48   0.00017s   5878.39   0.00018s   5670.66   0.00034s   2971.38       0.00067s        1484.97
+        NIST224p:     56   0.00021s   4705.08   0.00022s   4587.19   0.00040s   2499.96       0.00088s        1140.97
+        NIST256p:     64   0.00024s   4252.73   0.00024s   4108.48   0.00049s   2038.80       0.00096s        1043.03
+        NIST384p:     96   0.00041s   2455.84   0.00042s   2406.31   0.00079s   1260.03       0.00172s         580.61
+        NIST521p:    132   0.00070s   1419.16   0.00072s   1392.50   0.00139s    719.35       0.00307s         325.96
+       SECP256k1:     64   0.00024s   4228.87   0.00024s   4086.32   0.00047s   2124.86       0.00096s        1037.53
+ BRAINPOOLP160r1:     40   0.00014s   6932.12   0.00015s   6678.36   0.00030s   3387.90       0.00056s        1784.02
+ BRAINPOOLP192r1:     48   0.00017s   5886.05   0.00017s   5720.63   0.00034s   2941.22       0.00067s        1490.87
+ BRAINPOOLP224r1:     56   0.00021s   4748.89   0.00022s   4638.15   0.00041s   2460.86       0.00089s        1128.91
+ BRAINPOOLP256r1:     64   0.00024s   4248.00   0.00024s   4135.19   0.00045s   2209.69       0.00099s        1006.45
+ BRAINPOOLP320r1:     80   0.00032s   3096.85   0.00033s   3012.43   0.00065s   1547.07       0.00137s         728.60
+ BRAINPOOLP384r1:     96   0.00041s   2436.12   0.00042s   2396.23   0.00083s   1211.13       0.00176s         568.39
+ BRAINPOOLP512r1:    128   0.00063s   1580.09   0.00064s   1562.78   0.00129s    778.09       0.00279s         358.12
 
                        ecdh     ecdh/s
         NIST192p:   0.00051s   1960.26

diff --git a/speed.py b/speed.py
@@ -19,7 +19,8 @@ def do(setup_statements, statement):
     "{name:>16}{sep:1} {siglen:>6} {keygen:>9{form}}{unit:1} "
     "{keygen_inv:>9{form_inv}} {sign:>9{form}}{unit:1} "
     "{sign_inv:>9{form_inv}} {verify:>9{form}}{unit:1} "
-    "{verify_inv:>9{form_inv}}"
+    "{verify_inv:>9{form_inv}} {verify_single:>13{form}}{unit:1} "
+    "{verify_single_inv:>14{form_inv}}"
 )
 
 print(
@@ -31,6 +32,8 @@ def do(setup_statements, statement):
         sign_inv="sign/s",
         verify="verify",
         verify_inv="verify/s",
+        verify_single="no PC verify",
+        verify_single_inv="no PC verify/s",
         name="",
         sep="",
         unit="",
@@ -54,6 +57,7 @@ def do(setup_statements, statement):
     keygen = do([S1], S2)
     sign = do([S1, S2, S3], S4)
     verf = do([S1, S2, S3, S4, S5, S6], S7)
+    verf_single = do([S1, S2, S3, S4, S5], S7)
     import ecdsa
 
     c = getattr(ecdsa, curve)
@@ -70,6 +74,8 @@ def do(setup_statements, statement):
             sign_inv=1.0 / sign,
             verify=verf,
             verify_inv=1.0 / verf,
+            verify_single=verf_single,
+            verify_single_inv=1.0 / verf_single,
             form=".5f",
             form_inv=".2f",
         )

diff --git a/src/ecdsa/ellipticcurve.py b/src/ecdsa/ellipticcurve.py
@@ -39,7 +39,7 @@
     from gmpy2 import mpz
 
     GMPY = True
-except ImportError:
+except ImportError:  # pragma: no branch
     try:
         from gmpy import mpz
 
@@ -57,7 +57,7 @@
 class CurveFp(object):
     """Elliptic Curve over the field of integers modulo a prime."""
 
-    if GMPY:
+    if GMPY:  # pragma: no branch
 
         def __init__(self, p, a, b, h=None):
             """
@@ -75,7 +75,7 @@ def __init__(self, p, a, b, h=None):
             # gmpy with it
             self.__h = h
 
-    else:
+    else:  # pragma: no branch
 
         def __init__(self, p, a, b, h=None):
             """
@@ -164,12 +164,12 @@ def __init__(self, curve, x, y, z, order=None, generator=False):
         # since it's generally better (faster) to use scaled points vs unscaled
         # ones, use writer-biased RWLock for locking:
         self._update_lock = RWLock()
-        if GMPY:
+        if GMPY:  # pragma: no branch
             self.__x = mpz(x)
             self.__y = mpz(y)
             self.__z = mpz(z)
             self.__order = order and mpz(order)
-        else:
+        else:  # pragma: no branch
             self.__x = x
             self.__y = y
             self.__z = z
@@ -359,7 +359,8 @@ def from_affine(point, generator=False):
             point.curve(), point.x(), point.y(), 1, point.order(), generator
         )
 
-    # plese note that all the methods that use the equations from hyperelliptic
+    # please note that all the methods that use the equations from
+    # hyperelliptic
     # are formatted in a way to maximise performance.
     # Things that make code faster: multiplying instead of taking to the power
     # (`xx = x * x; xxxx = xx * xx % p` is faster than `xxxx = x**4 % p` and
@@ -389,7 +390,7 @@ def _double(self, X1, Y1, Z1, p, a):
         """Add a point to itself, arbitrary z."""
         if Z1 == 1:
             return self._double_with_z_1(X1, Y1, p, a)
-        if not Z1:
+        if not Y1 or not Z1:
             return 0, 0, 1
         # after:
         # http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
@@ -579,11 +580,11 @@ def _naf(mult):
             if mult % 2:
                 nd = mult % 4
                 if nd >= 2:
-                    nd = nd - 4
-                ret += [nd]
+                    nd -= 4
+                ret.append(nd)
                 mult -= nd
             else:
-                ret += [0]
+                ret.append(0)
             mult //= 2
         return ret
 
@@ -621,15 +622,6 @@ def __mul__(self, other):
 
         return PointJacobi(self.__curve, X3, Y3, Z3, self.__order)
 
-    @staticmethod
-    def _leftmost_bit(x):
-        """Return integer with the same magnitude as x but only one bit set"""
-        assert x > 0
-        result = 1
-        while result <= x:
-            result = 2 * result
-        return result // 2
-
     def mul_add(self, self_mul, other, other_mul):
         """
         Do two multiplications at the same time, add results.
@@ -643,7 +635,7 @@ def mul_add(self, self_mul, other, other_mul):
         if not isinstance(other, PointJacobi):
             other = PointJacobi.from_affine(other)
         # when the points have precomputed answers, then multiplying them alone
-        # is faster (as it uses NAF)
+        # is faster (as it uses NAF and no point doublings)
         self._maybe_precompute()
         other._maybe_precompute()
         if self.__precompute and other.__precompute:
@@ -653,32 +645,76 @@ def mul_add(self, self_mul, other, other_mul):
             self_mul = self_mul % self.__order
             other_mul = other_mul % self.__order
 
-        i = self._leftmost_bit(max(self_mul, other_mul)) * 2
+        # (X3, Y3, Z3) is the accumulator
         X3, Y3, Z3 = 0, 0, 1
         p, a = self.__curve.p(), self.__curve.a()
-        self = self.scale()
-        # after scaling, point is immutable, no need for locking
-        X1, Y1 = self.__x, self.__y
-        other = other.scale()
-        X2, Y2 = other.__x, other.__y
-        both = self + other
-        if both is INFINITY:
-            X4, Y4 = 0, 0
-        else:
-            both.scale()
-            X4, Y4 = both.__x, both.__y
+
+        # as we have 6 unique points to work with, we can't scale all of them,
+        # but do scale the ones that are used most often
+        # (post scale() points are immutable so no need for locking)
+        self.scale()
+        X1, Y1, Z1 = self.__x, self.__y, self.__z
+        other.scale()
+        X2, Y2, Z2 = other.__x, other.__y, other.__z
+
         _double = self._double
         _add = self._add
-        while i > 1:
+
+        # with NAF we have 3 options: no add, subtract, add
+        # so with 2 points, we have 9 combinations:
+        # 0, -A, +A, -B, -A-B, +A-B, +B, -A+B, +A+B
+        # so we need 4 combined points:
+        mAmB_X, mAmB_Y, mAmB_Z = _add(X1, -Y1, Z1, X2, -Y2, Z2, p)
+        pAmB_X, pAmB_Y, pAmB_Z = _add(X1, Y1, Z1, X2, -Y2, Z2, p)
+        mApB_X, mApB_Y, mApB_Z = _add(X1, -Y1, Z1, X2, Y2, Z2, p)
+        pApB_X, pApB_Y, pApB_Z = _add(X1, Y1, Z1, X2, Y2, Z2, p)
+        # when self and other sum to infinity, we need to multiply them
+        # separately and add the results to get the correct answer; as that
+        # is very unlikely in regular operation, we don't optimise this case
+        if not pApB_Y or not pApB_Z:
+            return self * self_mul + other * other_mul
+
+        # gmp object creation has cumulatively higher overhead than the
+        # speedup we get from calculating the NAF using gmp so ensure use
+        # of int()
+        self_naf = list(reversed(self._naf(int(self_mul))))
+        other_naf = list(reversed(self._naf(int(other_mul))))
+        # ensure that the lists are the same length (zip() will truncate
+        # the longer one otherwise)
+        if len(self_naf) < len(other_naf):
+            self_naf = [0] * (len(other_naf) - len(self_naf)) + self_naf
+        elif len(self_naf) > len(other_naf):
+            other_naf = [0] * (len(self_naf) - len(other_naf)) + other_naf
+
+        for A, B in zip(self_naf, other_naf):
             X3, Y3, Z3 = _double(X3, Y3, Z3, p, a)
-            i = i // 2
 
-            if self_mul & i and other_mul & i:
-                X3, Y3, Z3 = _add(X3, Y3, Z3, X4, Y4, 1, p)
-            elif self_mul & i:
-                X3, Y3, Z3 = _add(X3, Y3, Z3, X1, Y1, 1, p)
-            elif other_mul & i:
-                X3, Y3, Z3 = _add(X3, Y3, Z3, X2, Y2, 1, p)
+            # conditions ordered from most to least likely
+            if A == 0:
+                if B == 0:
+                    pass
+                elif B < 0:
+                    X3, Y3, Z3 = _add(X3, Y3, Z3, X2, -Y2, Z2, p)
+                else:
+                    assert B > 0
+                    X3, Y3, Z3 = _add(X3, Y3, Z3, X2, Y2, Z2, p)
+            elif A < 0:
+                if B == 0:
+                    X3, Y3, Z3 = _add(X3, Y3, Z3, X1, -Y1, Z1, p)
+                elif B < 0:
+                    X3, Y3, Z3 = _add(X3, Y3, Z3, mAmB_X, mAmB_Y, mAmB_Z, p)
+                else:
+                    assert B > 0
+                    X3, Y3, Z3 = _add(X3, Y3, Z3, mApB_X, mApB_Y, mApB_Z, p)
+            else:
+                assert A > 0
+                if B == 0:
+                    X3, Y3, Z3 = _add(X3, Y3, Z3, X1, Y1, Z1, p)
+                elif B < 0:
+                    X3, Y3, Z3 = _add(X3, Y3, Z3, pAmB_X, pAmB_Y, pAmB_Z, p)
+                else:
+                    assert B > 0
+                    X3, Y3, Z3 = _add(X3, Y3, Z3, pApB_X, pApB_Y, pApB_Z, p)
 
         if not Y3 or not Z3:
             return INFINITY