From b61c75c6895fbcb2c335fca5a6c777c043ef60bc Mon Sep 17 00:00:00 2001 From: Hubert Kario Date: Sat, 21 Nov 2020 20:54:59 +0100 Subject: [PATCH 1/2] add speed without precomputation --- speed.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/speed.py b/speed.py index 19132e0d..297e94f8 100644 --- a/speed.py +++ b/speed.py @@ -19,7 +19,8 @@ def do(setup_statements, statement): "{name:>16}{sep:1} {siglen:>6} {keygen:>9{form}}{unit:1} " "{keygen_inv:>9{form_inv}} {sign:>9{form}}{unit:1} " "{sign_inv:>9{form_inv}} {verify:>9{form}}{unit:1} " - "{verify_inv:>9{form_inv}}" + "{verify_inv:>9{form_inv}} {verify_single:>13{form}}{unit:1} " + "{verify_single_inv:>14{form_inv}}" ) print( @@ -31,6 +32,8 @@ def do(setup_statements, statement): sign_inv="sign/s", verify="verify", verify_inv="verify/s", + verify_single="no PC verify", + verify_single_inv="no PC verify/s", name="", sep="", unit="", @@ -54,6 +57,7 @@ def do(setup_statements, statement): keygen = do([S1], S2) sign = do([S1, S2, S3], S4) verf = do([S1, S2, S3, S4, S5, S6], S7) + verf_single = do([S1, S2, S3, S4, S5], S7) import ecdsa c = getattr(ecdsa, curve) @@ -70,6 +74,8 @@ def do(setup_statements, statement): sign_inv=1.0 / sign, verify=verf, verify_inv=1.0 / verf, + verify_single=verf_single, + verify_single_inv=1.0 / verf_single, form=".5f", form_inv=".2f", ) From 351c40b8cd6d0bc32349bf18a6f24b1674bd4123 Mon Sep 17 00:00:00 2001 From: Hubert Kario Date: Sat, 21 Nov 2020 21:02:43 +0100 Subject: [PATCH 2/2] use NAF for mul_add() While we were using the more clever algorithm, with addition of two points at a time when possible, it's possible to do it slightly faster by performing something similar but with 2-ary NAF. This speeds up single-shot verify by about 5% with python int() and by about 4% with gmpy's mpz(). --- README.md | 65 +++++++++++---------- src/ecdsa/ellipticcurve.py | 116 ++++++++++++++++++++++++------------- src/ecdsa/test_jacobi.py | 61 +++++++++++++++++++ tox.ini | 2 +- 
4 files changed, 172 insertions(+), 72 deletions(-) diff --git a/README.md b/README.md index 20e85bec..dd38dfed 100644 --- a/README.md +++ b/README.md @@ -72,32 +72,35 @@ pip install ecdsa[gmpy] The following table shows how long this library takes to generate keypairs (`keygen`), to sign data (`sign`), to verify those signatures (`verify`), -and to derive a shared secret (`ecdh`). +to derive a shared secret (`ecdh`), and +to verify the signatures with no key-specific precomputation (`no PC verify`). All those values are in seconds. For convenience, the inverses of those values are also provided: how many keys per second can be generated (`keygen/s`), how many signatures can be made per second (`sign/s`), how many signatures can be verified -per second (`verify/s`), and how many shared secrets can be derived per second -(`ecdh/s`). The size in bytes of a raw signature (generally the smallest +per second (`verify/s`), how many shared secrets can be derived per second +(`ecdh/s`), and how many signatures with no key-specific +precomputation can be verified per second (`no PC verify/s`). The size in bytes of a raw +signature (generally the smallest way a signature can be encoded) is also provided in the `siglen` column. Use `tox -e speed` to generate this table on your own computer. 
On an Intel Core i7 4790K @ 4.0GHz I'm getting the following performance: ``` - siglen keygen keygen/s sign sign/s verify verify/s - NIST192p: 48 0.00035s 2893.02 0.00038s 2620.53 0.00069s 1458.92 - NIST224p: 56 0.00043s 2307.11 0.00048s 2092.00 0.00088s 1131.33 - NIST256p: 64 0.00056s 1793.70 0.00061s 1639.87 0.00113s 883.79 - NIST384p: 96 0.00116s 864.33 0.00124s 806.29 0.00233s 429.87 - NIST521p: 132 0.00221s 452.16 0.00234s 427.31 0.00460s 217.19 - SECP256k1: 64 0.00056s 1772.65 0.00061s 1628.73 0.00110s 912.13 - BRAINPOOLP160r1: 40 0.00026s 3801.86 0.00029s 3401.11 0.00052s 1930.47 - BRAINPOOLP192r1: 48 0.00034s 2925.73 0.00038s 2634.34 0.00070s 1438.06 - BRAINPOOLP224r1: 56 0.00044s 2287.98 0.00048s 2083.87 0.00088s 1137.52 - BRAINPOOLP256r1: 64 0.00056s 1774.11 0.00061s 1628.25 0.00112s 890.71 - BRAINPOOLP320r1: 80 0.00081s 1238.18 0.00087s 1146.71 0.00151s 661.95 - BRAINPOOLP384r1: 96 0.00117s 855.47 0.00124s 804.56 0.00241s 414.83 - BRAINPOOLP512r1: 128 0.00223s 447.99 0.00234s 427.49 0.00437s 229.09 + siglen keygen keygen/s sign sign/s verify verify/s no PC verify no PC verify/s + NIST192p: 48 0.00033s 2991.13 0.00036s 2740.86 0.00067s 1502.11 0.00136s 737.54 + NIST224p: 56 0.00042s 2360.67 0.00046s 2190.16 0.00083s 1201.83 0.00170s 587.79 + NIST256p: 64 0.00053s 1872.02 0.00057s 1743.08 0.00103s 968.53 0.00219s 457.36 + NIST384p: 96 0.00110s 907.45 0.00116s 861.63 0.00218s 459.38 0.00445s 224.92 + NIST521p: 132 0.00214s 467.72 0.00223s 448.70 0.00430s 232.76 0.00888s 112.66 + SECP256k1: 64 0.00054s 1841.11 0.00058s 1722.33 0.00111s 903.07 0.00216s 464.01 + BRAINPOOLP160r1: 40 0.00026s 3780.81 0.00029s 3422.67 0.00054s 1863.09 0.00109s 914.93 + BRAINPOOLP192r1: 48 0.00034s 2942.79 0.00037s 2710.56 0.00070s 1435.59 0.00138s 724.79 + BRAINPOOLP224r1: 56 0.00044s 2278.35 0.00047s 2145.32 0.00090s 1115.34 0.00182s 549.72 + BRAINPOOLP256r1: 64 0.00055s 1832.95 0.00059s 1704.50 0.00110s 911.02 0.00234s 427.22 + BRAINPOOLP320r1: 80 0.00077s 1305.78 0.00082s 
1222.47 0.00156s 640.27 0.00321s 311.56 + BRAINPOOLP384r1: 96 0.00112s 893.07 0.00118s 849.32 0.00228s 438.75 0.00478s 209.35 + BRAINPOOLP512r1: 128 0.00213s 470.08 0.00221s 451.98 0.00419s 238.70 0.00940s 106.44 ecdh ecdh/s NIST192p: 0.00110s 910.70 @@ -118,20 +121,20 @@ On an Intel Core i7 4790K @ 4.0GHz I'm getting the following performance: To test performance with `gmpy2` loaded, use `tox -e speedgmpy2`. On the same machine I'm getting the following performance with `gmpy2`: ``` - siglen keygen keygen/s sign sign/s verify verify/s - NIST192p: 48 0.00017s 5945.50 0.00018s 5544.66 0.00033s 3002.54 - NIST224p: 56 0.00021s 4742.14 0.00022s 4463.52 0.00044s 2248.59 - NIST256p: 64 0.00024s 4155.73 0.00025s 3994.28 0.00047s 2105.34 - NIST384p: 96 0.00041s 2415.06 0.00043s 2316.41 0.00085s 1177.18 - NIST521p: 132 0.00072s 1391.14 0.00074s 1359.63 0.00140s 716.31 - SECP256k1: 64 0.00024s 4216.50 0.00025s 3994.52 0.00047s 2120.57 - BRAINPOOLP160r1: 40 0.00014s 7038.99 0.00015s 6501.55 0.00029s 3397.79 - BRAINPOOLP192r1: 48 0.00017s 5983.18 0.00018s 5626.08 0.00035s 2843.62 - BRAINPOOLP224r1: 56 0.00021s 4727.54 0.00022s 4464.86 0.00043s 2326.84 - BRAINPOOLP256r1: 64 0.00024s 4221.00 0.00025s 4010.26 0.00049s 2046.40 - BRAINPOOLP320r1: 80 0.00032s 3142.14 0.00033s 3009.15 0.00061s 1652.88 - BRAINPOOLP384r1: 96 0.00041s 2415.98 0.00043s 2340.35 0.00083s 1198.77 - BRAINPOOLP512r1: 128 0.00064s 1567.27 0.00066s 1526.33 0.00127s 788.51 + siglen keygen keygen/s sign sign/s verify verify/s no PC verify no PC verify/s + NIST192p: 48 0.00017s 5878.39 0.00018s 5670.66 0.00034s 2971.38 0.00067s 1484.97 + NIST224p: 56 0.00021s 4705.08 0.00022s 4587.19 0.00040s 2499.96 0.00088s 1140.97 + NIST256p: 64 0.00024s 4252.73 0.00024s 4108.48 0.00049s 2038.80 0.00096s 1043.03 + NIST384p: 96 0.00041s 2455.84 0.00042s 2406.31 0.00079s 1260.03 0.00172s 580.61 + NIST521p: 132 0.00070s 1419.16 0.00072s 1392.50 0.00139s 719.35 0.00307s 325.96 + SECP256k1: 64 0.00024s 4228.87 0.00024s 4086.32 
0.00047s 2124.86 0.00096s 1037.53 + BRAINPOOLP160r1: 40 0.00014s 6932.12 0.00015s 6678.36 0.00030s 3387.90 0.00056s 1784.02 + BRAINPOOLP192r1: 48 0.00017s 5886.05 0.00017s 5720.63 0.00034s 2941.22 0.00067s 1490.87 + BRAINPOOLP224r1: 56 0.00021s 4748.89 0.00022s 4638.15 0.00041s 2460.86 0.00089s 1128.91 + BRAINPOOLP256r1: 64 0.00024s 4248.00 0.00024s 4135.19 0.00045s 2209.69 0.00099s 1006.45 + BRAINPOOLP320r1: 80 0.00032s 3096.85 0.00033s 3012.43 0.00065s 1547.07 0.00137s 728.60 + BRAINPOOLP384r1: 96 0.00041s 2436.12 0.00042s 2396.23 0.00083s 1211.13 0.00176s 568.39 + BRAINPOOLP512r1: 128 0.00063s 1580.09 0.00064s 1562.78 0.00129s 778.09 0.00279s 358.12 ecdh ecdh/s NIST192p: 0.00051s 1960.26 diff --git a/src/ecdsa/ellipticcurve.py b/src/ecdsa/ellipticcurve.py index 25565df9..0617c6ea 100644 --- a/src/ecdsa/ellipticcurve.py +++ b/src/ecdsa/ellipticcurve.py @@ -39,7 +39,7 @@ from gmpy2 import mpz GMPY = True -except ImportError: +except ImportError: # pragma: no branch try: from gmpy import mpz @@ -57,7 +57,7 @@ class CurveFp(object): """Elliptic Curve over the field of integers modulo a prime.""" - if GMPY: + if GMPY: # pragma: no branch def __init__(self, p, a, b, h=None): """ @@ -75,7 +75,7 @@ def __init__(self, p, a, b, h=None): # gmpy with it self.__h = h - else: + else: # pragma: no branch def __init__(self, p, a, b, h=None): """ @@ -164,12 +164,12 @@ def __init__(self, curve, x, y, z, order=None, generator=False): # since it's generally better (faster) to use scaled points vs unscaled # ones, use writer-biased RWLock for locking: self._update_lock = RWLock() - if GMPY: + if GMPY: # pragma: no branch self.__x = mpz(x) self.__y = mpz(y) self.__z = mpz(z) self.__order = order and mpz(order) - else: + else: # pragma: no branch self.__x = x self.__y = y self.__z = z @@ -359,7 +359,8 @@ def from_affine(point, generator=False): point.curve(), point.x(), point.y(), 1, point.order(), generator ) - # plese note that all the methods that use the equations from 
hyperelliptic + # please note that all the methods that use the equations from + # hyperelliptic # are formatted in a way to maximise performance. # Things that make code faster: multiplying instead of taking to the power # (`xx = x * x; xxxx = xx * xx % p` is faster than `xxxx = x**4 % p` and @@ -389,7 +390,7 @@ def _double(self, X1, Y1, Z1, p, a): """Add a point to itself, arbitrary z.""" if Z1 == 1: return self._double_with_z_1(X1, Y1, p, a) - if not Z1: + if not Y1 or not Z1: return 0, 0, 1 # after: # http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl @@ -579,11 +580,11 @@ def _naf(mult): if mult % 2: nd = mult % 4 if nd >= 2: - nd = nd - 4 - ret += [nd] + nd -= 4 + ret.append(nd) mult -= nd else: - ret += [0] + ret.append(0) mult //= 2 return ret @@ -621,15 +622,6 @@ def __mul__(self, other): return PointJacobi(self.__curve, X3, Y3, Z3, self.__order) - @staticmethod - def _leftmost_bit(x): - """Return integer with the same magnitude as x but only one bit set""" - assert x > 0 - result = 1 - while result <= x: - result = 2 * result - return result // 2 - def mul_add(self, self_mul, other, other_mul): """ Do two multiplications at the same time, add results. 
@@ -643,7 +635,7 @@ def mul_add(self, self_mul, other, other_mul): if not isinstance(other, PointJacobi): other = PointJacobi.from_affine(other) # when the points have precomputed answers, then multiplying them alone - # is faster (as it uses NAF) + # is faster (as it uses NAF and no point doublings) self._maybe_precompute() other._maybe_precompute() if self.__precompute and other.__precompute: @@ -653,32 +645,76 @@ def mul_add(self, self_mul, other, other_mul): self_mul = self_mul % self.__order other_mul = other_mul % self.__order - i = self._leftmost_bit(max(self_mul, other_mul)) * 2 + # (X3, Y3, Z3) is the accumulator X3, Y3, Z3 = 0, 0, 1 p, a = self.__curve.p(), self.__curve.a() - self = self.scale() - # after scaling, point is immutable, no need for locking - X1, Y1 = self.__x, self.__y - other = other.scale() - X2, Y2 = other.__x, other.__y - both = self + other - if both is INFINITY: - X4, Y4 = 0, 0 - else: - both.scale() - X4, Y4 = both.__x, both.__y + + # as we have 6 unique points to work with, we can't scale all of them, + # but do scale the ones that are used most often + # (post scale() points are immutable so no need for locking) + self.scale() + X1, Y1, Z1 = self.__x, self.__y, self.__z + other.scale() + X2, Y2, Z2 = other.__x, other.__y, other.__z + _double = self._double _add = self._add - while i > 1: + + # with NAF we have 3 options: no add, subtract, add + # so with 2 points, we have 9 combinations: + # 0, -A, +A, -B, -A-B, +A-B, +B, -A+B, +A+B + # so we need 4 combined points: + mAmB_X, mAmB_Y, mAmB_Z = _add(X1, -Y1, Z1, X2, -Y2, Z2, p) + pAmB_X, pAmB_Y, pAmB_Z = _add(X1, Y1, Z1, X2, -Y2, Z2, p) + mApB_X, mApB_Y, mApB_Z = _add(X1, -Y1, Z1, X2, Y2, Z2, p) + pApB_X, pApB_Y, pApB_Z = _add(X1, Y1, Z1, X2, Y2, Z2, p) + # when the self and other sum to infinity, we need to add them + # one by one to get correct result but as that's very unlikely to + # happen in regular operation, we don't need to optimise this case + if not pApB_Y or not pApB_Z: + 
return self * self_mul + other * other_mul + + # gmp object creation has cumulatively higher overhead than the + # speedup we get from calculating the NAF using gmp so ensure use + # of int() + self_naf = list(reversed(self._naf(int(self_mul)))) + other_naf = list(reversed(self._naf(int(other_mul)))) + # ensure that the lists are the same length (zip() will truncate + # longer one otherwise) + if len(self_naf) < len(other_naf): + self_naf = [0] * (len(other_naf) - len(self_naf)) + self_naf + elif len(self_naf) > len(other_naf): + other_naf = [0] * (len(self_naf) - len(other_naf)) + other_naf + + for A, B in zip(self_naf, other_naf): X3, Y3, Z3 = _double(X3, Y3, Z3, p, a) - i = i // 2 - if self_mul & i and other_mul & i: - X3, Y3, Z3 = _add(X3, Y3, Z3, X4, Y4, 1, p) - elif self_mul & i: - X3, Y3, Z3 = _add(X3, Y3, Z3, X1, Y1, 1, p) - elif other_mul & i: - X3, Y3, Z3 = _add(X3, Y3, Z3, X2, Y2, 1, p) + # conditions ordered from most to least likely + if A == 0: + if B == 0: + pass + elif B < 0: + X3, Y3, Z3 = _add(X3, Y3, Z3, X2, -Y2, Z2, p) + else: + assert B > 0 + X3, Y3, Z3 = _add(X3, Y3, Z3, X2, Y2, Z2, p) + elif A < 0: + if B == 0: + X3, Y3, Z3 = _add(X3, Y3, Z3, X1, -Y1, Z1, p) + elif B < 0: + X3, Y3, Z3 = _add(X3, Y3, Z3, mAmB_X, mAmB_Y, mAmB_Z, p) + else: + assert B > 0 + X3, Y3, Z3 = _add(X3, Y3, Z3, mApB_X, mApB_Y, mApB_Z, p) + else: + assert A > 0 + if B == 0: + X3, Y3, Z3 = _add(X3, Y3, Z3, X1, Y1, Z1, p) + elif B < 0: + X3, Y3, Z3 = _add(X3, Y3, Z3, pAmB_X, pAmB_Y, pAmB_Z, p) + else: + assert B > 0 + X3, Y3, Z3 = _add(X3, Y3, Z3, pApB_X, pApB_Y, pApB_Z, p) if not Y3 or not Z3: return INFINITY diff --git a/src/ecdsa/test_jacobi.py b/src/ecdsa/test_jacobi.py index 43ed6c12..4a938a32 100644 --- a/src/ecdsa/test_jacobi.py +++ b/src/ecdsa/test_jacobi.py @@ -367,6 +367,11 @@ def test_add_point_3_times(self): self.assertEqual(j_g * 3, j_g + j_g + j_g) + def test_mul_without_order(self): + j_g = PointJacobi(curve_256, generator_256.x(), generator_256.y(), 1) + + 
self.assertEqual(j_g * generator_256.order(), INFINITY) + def test_mul_add_inf(self): j_g = PointJacobi.from_affine(generator_256) @@ -405,6 +410,21 @@ def test_mul_add_to_mul(self): self.assertEqual(a, b) + def test_mul_add_differnt(self): + j_g = PointJacobi.from_affine(generator_256) + + w_a = j_g * 2 + + self.assertEqual(j_g.mul_add(1, w_a, 1), j_g * 3) + + def test_mul_add_slightly_different(self): + j_g = PointJacobi.from_affine(generator_256) + + w_a = j_g * 2 + w_b = j_g * 3 + + self.assertEqual(w_a.mul_add(1, w_b, 3), w_a * 1 + w_b * 3) + def test_mul_add(self): j_g = PointJacobi.from_affine(generator_256) @@ -428,11 +448,52 @@ def test_mul_add_large(self): j_g * (0xFF00 + 255 * 0xF0F0), j_g.mul_add(0xFF00, b, 0xF0F0) ) + def test_mul_add_with_infinity_as_result(self): + j_g = PointJacobi.from_affine(generator_256) + + order = generator_256.order() + + b = PointJacobi.from_affine(generator_256 * 256) + + self.assertEqual(j_g.mul_add(order % 256, b, order // 256), INFINITY) + + def test_mul_add_without_order(self): + j_g = PointJacobi(curve_256, generator_256.x(), generator_256.y(), 1) + + order = generator_256.order() + + w_b = generator_256 * 34 + w_b.scale() + + b = PointJacobi(curve_256, w_b.x(), w_b.y(), 1) + + self.assertEqual(j_g.mul_add(order % 34, b, order // 34), INFINITY) + + def test_mul_add_with_doubled_negation_of_itself(self): + j_g = PointJacobi.from_affine(generator_256 * 17) + + order = generator_256.order() + + dbl_neg = 2 * (-j_g) + + self.assertEqual(j_g.mul_add(4, dbl_neg, 2), INFINITY) + def test_equality(self): pj1 = PointJacobi(curve=CurveFp(23, 1, 1, 1), x=2, y=3, z=1, order=1) pj2 = PointJacobi(curve=CurveFp(23, 1, 1, 1), x=2, y=3, z=1, order=1) self.assertEqual(pj1, pj2) + def test_equality_with_invalid_object(self): + j_g = PointJacobi.from_affine(generator_256) + + self.assertNotEqual(j_g, 12) + + def test_equality_with_wrong_curves(self): + p_a = PointJacobi.from_affine(generator_256) + p_b = 
PointJacobi.from_affine(generator_224) + + self.assertNotEqual(p_a, p_b) + def test_pickle(self): pj = PointJacobi(curve=CurveFp(23, 1, 1, 1), x=2, y=3, z=1, order=1) self.assertEqual(pickle.loads(pickle.dumps(pj)), pj) diff --git a/tox.ini b/tox.ini index 2af988c7..4b072c3f 100644 --- a/tox.ini +++ b/tox.ini @@ -60,7 +60,7 @@ sitepackages=True whitelist_externals=coverage commands = coverage run --branch -m pytest --hypothesis-show-statistics {posargs:src/ecdsa} - coverage xml + coverage html coverage report -m [testenv:speed]