diff --git a/configure.ac b/configure.ac index a2697e65ad..660fd94262 100644 --- a/configure.ac +++ b/configure.ac @@ -2559,6 +2559,7 @@ then # Include options.h AM_CCASFLAGS="$AM_CCASFLAGS -DEXTERNAL_OPTS_OPENVPN" ENABLED_ARMASM_CRYPTO=yes + ENABLED_ARMASM_NEON=yes # Check for and set -mstrict-align compiler flag # Used to set assumption that Aarch64 systems will not handle @@ -2578,18 +2579,30 @@ then AC_MSG_NOTICE([64bit ARMv8 found, setting mcpu to generic+crypto]) ;; armv7a*) - AM_CPPFLAGS="$AM_CPPFLAGS -march=armv7-a -mfpu=neon -DWOLFSSL_ARMASM_NO_HW_CRYPTO -DWOLFSSL_ARM_ARCH=7" + AM_CPPFLAGS="$AM_CPPFLAGS -march=armv7-a -mfpu=neon -DWOLFSSL_ARM_ARCH=7" # Include options.h AM_CCASFLAGS="$AM_CCASFLAGS -DEXTERNAL_OPTS_OPENVPN" ENABLED_ARMASM_CRYPTO=no ENABLED_AESGCM_STREAM=no # not yet implemented + ENABLED_ARMASM_NEON=yes AC_MSG_NOTICE([32bit ARMv7-a found, setting mfpu to neon]) ;; + armv7m*) + # QEMU doesn't work with armv7-m + AM_CPPFLAGS="$AM_CPPFLAGS -march=armv7-r -D__thumb__ -fomit-frame-pointer -DWOLFSSL_ARMASM_NO_HW_CRYPTO -DWOLFSSL_ARM_ARCH=7" + # Include options.h + AM_CCASFLAGS="$AM_CCASFLAGS -DEXTERNAL_OPTS_OPENVPN" + ENABLED_ARMASM_CRYPTO=no + ENABLED_AESGCM_STREAM=no # not yet implemented + ENABLED_ARMASM_NEON=no + AC_MSG_NOTICE([32bit ARMv7-m found]) + ;; *) - AM_CPPFLAGS="$AM_CPPFLAGS -mfpu=crypto-neon-fp-armv8" + AM_CPPFLAGS="$AM_CPPFLAGS -mfpu=crypto-neon-fp-armv8 -marm" # Include options.h AM_CCASFLAGS="$AM_CCASFLAGS -DEXTERNAL_OPTS_OPENVPN" ENABLED_ARMASM_CRYPTO=yes + ENABLED_ARMASM_NEON=yes AC_MSG_NOTICE([32bit ARMv8 found, setting mfpu to crypto-neon-fp-armv8]) ;; esac @@ -2606,6 +2619,17 @@ fi if test "$ENABLED_ARMASM_SM4" = "yes"; then AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_ARMASM_CRYPTO_SM4" fi +if test "$ENABLED_ARMASM_CRYPTO" = "no"; then + AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_ARMASM_NO_HW_CRYPTO" +fi +if test "$ENABLED_ARMASM_NEON" = "no"; then + AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_ARMASM_NO_NEON" + AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_ARMASM_NO_NEON" +fi + +if test "$ENABLED_ARMASM_INLINE" = "yes"; then + AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_ARMASM_INLINE" +fi # Xilinx hardened crypto AC_ARG_ENABLE([xilinx], @@ -3598,6 +3622,7 @@ then fi AM_CFLAGS="$AM_CFLAGS -DHAVE_CURVE25519" + AM_CCASFLAGS="$AM_CCASFLAGS -DHAVE_CURVE25519" ENABLED_FEMATH=yes fi @@ -8379,6 +8404,8 @@ AS_IF([test "x$ENABLED_CERTEXT" = "xyes"], AS_IF([test "x$ENABLED_ED25519" = "xyes" && test "x$ENABLED_32BIT" = "xno"], [AM_CFLAGS="$AM_CFLAGS -DHAVE_ED25519"]) +AS_IF([test "x$ENABLED_ED25519" = "xyes" && test "x$ENABLED_32BIT" = "xno"], + [AM_CCASFLAGS="$AM_CCASFLAGS -DHAVE_ED25519"]) AS_IF([test "x$ENABLED_ED25519_SMALL" = "xyes"], [AM_CFLAGS="$AM_CFLAGS -DED25519_SMALL"]) @@ -8841,6 +8868,7 @@ AM_CONDITIONAL([BUILD_AESCCM],[test "x$ENABLED_AESCCM" = "xyes" || test "x$ENABL AM_CONDITIONAL([BUILD_ARMASM],[test "x$ENABLED_ARMASM" = "xyes"]) AM_CONDITIONAL([BUILD_ARMASM_INLINE],[test "x$ENABLED_ARMASM_INLINE" = "xyes"]) AM_CONDITIONAL([BUILD_ARMASM_CRYPTO],[test "x$ENABLED_ARMASM_CRYPTO" = "xyes"]) +AM_CONDITIONAL([BUILD_ARMASM_NEON],[test "x$ENABLED_ARMASM_NEON" = "xyes"]) AM_CONDITIONAL([BUILD_XILINX],[test "x$ENABLED_XILINX" = "xyes"]) AM_CONDITIONAL([BUILD_AESNI],[test "x$ENABLED_AESNI" = "xyes"]) AM_CONDITIONAL([BUILD_INTELASM],[test "x$ENABLED_INTELASM" = "xyes"]) diff --git a/src/include.am b/src/include.am index 81c61310f2..7efe27e397 100644 --- a/src/include.am +++ b/src/include.am @@ -157,13 +157,17 @@ endif if BUILD_AES src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes.c -if BUILD_ARMASM +if 
BUILD_ARMASM_NEON src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-aes.c if !BUILD_ARMASM_CRYPTO +if BUILD_ARMASM_INLINE +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c +else src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-aes-asm.S -endif -endif -endif +endif !BUILD_ARMASM_INLINE +endif !BUILD_ARMASM_CRYPTO +endif BUILD_ARMASM_NEON +endif BUILD_AES if BUILD_AESNI src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_asm.S @@ -178,22 +182,31 @@ if BUILD_SHA src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sha.c endif -if BUILD_ARMASM +if BUILD_ARMASM_NEON src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha256.c if BUILD_ARMASM_INLINE src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c else src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha256-asm.S -endif +endif !BUILD_ARMASM_INLINE +else +if BUILD_ARMASM +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha256.c +if BUILD_ARMASM_INLINE +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c +else +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-sha256-asm.S +endif !BUILD_ARMASM_INLINE else src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sha256.c if BUILD_INTELASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sha256_asm.S -endif -endif +endif BUILD_INTELASM +endif !BUILD_ARMASM +endif !BUILD_ARMASM_NEON if BUILD_SHA512 -if BUILD_ARMASM +if BUILD_ARMASM_NEON src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512.c if BUILD_ARMASM_INLINE src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512-asm_c.c @@ -201,24 +214,33 @@ src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha512-a else src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512-asm.S src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha512-asm.S -endif +endif !BUILD_ARMASM_INLINE +else +if BUILD_ARMASM +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512.c +if BUILD_ARMASM_INLINE +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c +else +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-sha512-asm.S +endif !BUILD_ARMASM_INLINE else src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sha512.c if BUILD_INTELASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sha512_asm.S -endif -endif -endif +endif BUILD_INTELASM +endif !BUILD_ARMASM +endif !BUILD_ARMASM_NEON +endif BUILD_SHA512 if BUILD_SHA3 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sha3.c -if BUILD_ARMASM +if BUILD_ARMASM_NEON if BUILD_ARMASM_INLINE src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha3-asm_c.c else src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha3-asm.S -endif -endif +endif !BUILD_ARMASM_INLINE +endif BUILD_ARMASM_NEON if BUILD_INTELASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sha3_asm.S endif @@ -283,18 +305,27 @@ endif !BUILD_FIPS_CURRENT if !BUILD_FIPS_CURRENT src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sha256.c -if BUILD_ARMASM +if BUILD_ARMASM_NEON src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha256.c if BUILD_ARMASM_INLINE src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c else src_libwolfssl@LIBSUFFIX@_la_SOURCES += 
wolfcrypt/src/port/arm/armv8-32-sha256-asm.S -endif +endif !BUILD_ARMASM_INLINE +else +if BUILD_ARMASM +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha256.c +if BUILD_ARMASM_INLINE +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c +else +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-sha256-asm.S +endif !BUILD_ARMASM_INLINE else if BUILD_INTELASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sha256_asm.S -endif -endif +endif BUILD_INTELASM +endif !BUILD_ARMASM +endif !BUILD_ARMASM_NEON endif !BUILD_FIPS_CURRENT if BUILD_AFALG @@ -370,15 +401,19 @@ endif if !BUILD_FIPS_CURRENT if BUILD_AES src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes.c -if BUILD_ARMASM +if BUILD_ARMASM_NEON src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-aes.c if !BUILD_ARMASM_CRYPTO +if BUILD_ARMASM_INLINE +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c +else src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-aes-asm.S +endif !BUILD_ARMASM_INLINE endif !BUILD_ARMASM_CRYPTO -endif BUILD_ARMASM +endif BUILD_ARMASM_NEON if BUILD_AFALG src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/af_alg/afalg_aes.c -endif +endif BUILD_AFALG endif BUILD_AES endif !BUILD_FIPS_CURRENT @@ -402,7 +437,7 @@ endif !BUILD_FIPS_CURRENT if !BUILD_FIPS_CURRENT if BUILD_SHA512 -if BUILD_ARMASM +if BUILD_ARMASM_NEON src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512.c if BUILD_ARMASM_INLINE src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512-asm_c.c @@ -410,26 +445,35 @@ src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha512-a else src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512-asm.S src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha512-asm.S -endif +endif !BUILD_ARMASM_INLINE +else +if BUILD_ARMASM +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512.c +if BUILD_ARMASM_INLINE +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c +else +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-sha512-asm.S +endif !BUILD_ARMASM_INLINE else src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sha512.c if BUILD_INTELASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sha512_asm.S -endif -endif -endif +endif BUILD_INTELASM +endif !BUILD_ARMASM +endif !BUILD_ARMASM_NEON +endif BUILD_SHA512 endif !BUILD_FIPS_CURRENT if !BUILD_FIPS_CURRENT if BUILD_SHA3 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sha3.c -if BUILD_ARMASM +if BUILD_ARMASM_NEON if BUILD_ARMASM_INLINE src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha3-asm_c.c else src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha3-asm.S -endif -endif +endif !BUILD_ARMASM_INLINE +endif BUILD_ARMASM_NEON if BUILD_INTELASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/sha3_asm.S endif @@ -569,7 +613,7 @@ src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/blake2s.c endif if BUILD_CHACHA -if BUILD_ARMASM +if BUILD_ARMASM_NEON src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-chacha.c else src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/chacha.c @@ -637,19 +681,29 @@ if BUILD_INTELASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_x25519_asm.S else if BUILD_ARMASM +if BUILD_ARMASM_NEON if BUILD_ARMASM_INLINE 
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-curve25519_c.c src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519_c.c else src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-curve25519.S src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519.S -endif +endif !BUILD_ARMASM_INLINE +else +if BUILD_ARMASM_INLINE +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-curve25519_c.c +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519_c.c +else +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-curve25519.S +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519.S +endif !BUILD_ARMASM_INLINE +endif !BUILD_ARMASM_NEON else src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_operations.c -endif -endif -endif -endif +endif !BUILD_ARMASM +endif !BUILD_INTELASM +endif !BUILD_CURVE25519_SMALL +endif BUILD_FEMATH if BUILD_GEMATH if BUILD_ED25519_SMALL @@ -661,12 +715,22 @@ if BUILD_INTELASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_x25519_asm.S else if BUILD_ARMASM +if BUILD_ARMASM_NEON if BUILD_ARMASM_INLINE src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519_c.c else src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519.S endif else +if BUILD_ARMASM_INLINE +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-curve25519_c.c +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519_c.c +else +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-curve25519.S +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519.S +endif +endif +else src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_operations.c endif endif diff --git a/tests/api.c b/tests/api.c index 3cd00f20f7..464d279a09 100644 --- a/tests/api.c +++ b/tests/api.c @@ -20079,7 +20079,7 @@ static int test_wc_DsaExportKeyRaw(void) static int test_wc_ed25519_make_key(void) { EXPECT_DECLS; -#if defined(HAVE_ED25519) +#if defined(HAVE_ED25519) && defined(HAVE_ED25519_MAKE_KEY) ed25519_key key; WC_RNG rng; unsigned char pubkey[ED25519_PUB_KEY_SIZE]; @@ -20221,7 +20221,9 @@ static int test_wc_ed25519_import_public(void) ExpectIntEQ(wc_ed25519_init(&pubKey), 0); ExpectIntEQ(wc_InitRng(&rng), 0); +#ifdef HAVE_ED25519_MAKE_KEY ExpectIntEQ(wc_ed25519_make_key(&rng, ED25519_KEY_SIZE, &pubKey), 0); +#endif ExpectIntEQ(wc_ed25519_import_public_ex(in, inlen, &pubKey, 1), 0); ExpectIntEQ(XMEMCMP(in, pubKey.p, inlen), 0); @@ -20260,7 +20262,9 @@ static int test_wc_ed25519_import_private_key(void) ExpectIntEQ(wc_ed25519_init(&key), 0); ExpectIntEQ(wc_InitRng(&rng), 0); +#ifdef HAVE_ED25519_MAKE_KEY ExpectIntEQ(wc_ed25519_make_key(&rng, ED25519_KEY_SIZE, &key), 0); +#endif ExpectIntEQ(wc_ed25519_import_private_key_ex(privKey, privKeySz, pubKey, pubKeySz, &key, 1), 0); @@ -20308,13 +20312,32 @@ static int test_wc_ed25519_export(void) byte pub[ED25519_PUB_KEY_SIZE]; word32 privSz = sizeof(priv); word32 pubSz = sizeof(pub); +#ifndef HAVE_ED25519_MAKE_KEY + const byte privKey[] = { + 0xf8, 0x55, 0xb7, 0xb6, 0x49, 0x3f, 0x99, 0x9c, + 0x88, 0xe3, 0xc5, 0x42, 0x6a, 0xa4, 0x47, 0x4a, + 0xe4, 0x95, 0xda, 0xdb, 0xbf, 0xf8, 0xa7, 0x42, + 0x9d, 0x0e, 0xe7, 0xd0, 0x57, 0x8f, 0x16, 0x69 + }; + const byte pubKey[] = { + 0x42, 0x3b, 0x7a, 0xf9, 0x82, 0xcf, 0xf9, 0xdf, + 0x19, 0xdd, 0xf3, 0xf0, 0x32, 0x29, 0x6d, 0xfa, + 0xfd, 0x76, 0x4f, 
0x68, 0xc2, 0xc2, 0xe0, 0x6c, + 0x47, 0xae, 0xc2, 0x55, 0x68, 0xac, 0x0d, 0x4d + }; +#endif XMEMSET(&key, 0, sizeof(ed25519_key)); XMEMSET(&rng, 0, sizeof(WC_RNG)); ExpectIntEQ(wc_ed25519_init(&key), 0); ExpectIntEQ(wc_InitRng(&rng), 0); +#ifdef HAVE_ED25519_MAKE_KEY ExpectIntEQ(wc_ed25519_make_key(&rng, ED25519_KEY_SIZE, &key), 0); +#else + ExpectIntEQ(wc_ed25519_import_private_key_ex(privKey, sizeof(privKey), + pubKey, sizeof(pubKey), &key, 1), 0); +#endif ExpectIntEQ(wc_ed25519_export_public(&key, pub, &pubSz), 0); ExpectIntEQ(pubSz, ED25519_KEY_SIZE); @@ -20350,13 +20373,32 @@ static int test_wc_ed25519_size(void) #if defined(HAVE_ED25519) ed25519_key key; WC_RNG rng; +#ifndef HAVE_ED25519_MAKE_KEY + const byte privKey[] = { + 0xf8, 0x55, 0xb7, 0xb6, 0x49, 0x3f, 0x99, 0x9c, + 0x88, 0xe3, 0xc5, 0x42, 0x6a, 0xa4, 0x47, 0x4a, + 0xe4, 0x95, 0xda, 0xdb, 0xbf, 0xf8, 0xa7, 0x42, + 0x9d, 0x0e, 0xe7, 0xd0, 0x57, 0x8f, 0x16, 0x69 + }; + const byte pubKey[] = { + 0x42, 0x3b, 0x7a, 0xf9, 0x82, 0xcf, 0xf9, 0xdf, + 0x19, 0xdd, 0xf3, 0xf0, 0x32, 0x29, 0x6d, 0xfa, + 0xfd, 0x76, 0x4f, 0x68, 0xc2, 0xc2, 0xe0, 0x6c, + 0x47, 0xae, 0xc2, 0x55, 0x68, 0xac, 0x0d, 0x4d + }; +#endif XMEMSET(&key, 0, sizeof(ed25519_key)); XMEMSET(&rng, 0, sizeof(WC_RNG)); ExpectIntEQ(wc_ed25519_init(&key), 0); ExpectIntEQ(wc_InitRng(&rng), 0); +#ifdef HAVE_ED25519_MAKE_KEY ExpectIntEQ(wc_ed25519_make_key(&rng, ED25519_KEY_SIZE, &key), 0); +#else + ExpectIntEQ(wc_ed25519_import_private_key_ex(privKey, sizeof(privKey), + pubKey, sizeof(pubKey), &key, 1), 0); +#endif ExpectIntEQ(wc_ed25519_size(&key), ED25519_KEY_SIZE); /* Test bad args. */ @@ -20395,13 +20437,32 @@ static int test_wc_ed25519_exportKey(void) word32 privSz = sizeof(priv); word32 pubSz = sizeof(pub); word32 privOnlySz = sizeof(privOnly); +#ifndef HAVE_ED25519_MAKE_KEY + const byte privKey[] = { + 0xf8, 0x55, 0xb7, 0xb6, 0x49, 0x3f, 0x99, 0x9c, + 0x88, 0xe3, 0xc5, 0x42, 0x6a, 0xa4, 0x47, 0x4a, + 0xe4, 0x95, 0xda, 0xdb, 0xbf, 0xf8, 0xa7, 0x42, + 0x9d, 0x0e, 0xe7, 0xd0, 0x57, 0x8f, 0x16, 0x69 + }; + const byte pubKey[] = { + 0x42, 0x3b, 0x7a, 0xf9, 0x82, 0xcf, 0xf9, 0xdf, + 0x19, 0xdd, 0xf3, 0xf0, 0x32, 0x29, 0x6d, 0xfa, + 0xfd, 0x76, 0x4f, 0x68, 0xc2, 0xc2, 0xe0, 0x6c, + 0x47, 0xae, 0xc2, 0x55, 0x68, 0xac, 0x0d, 0x4d + }; +#endif XMEMSET(&key, 0, sizeof(ed25519_key)); XMEMSET(&rng, 0, sizeof(WC_RNG)); ExpectIntEQ(wc_ed25519_init(&key), 0); ExpectIntEQ(wc_InitRng(&rng), 0); +#ifdef HAVE_ED25519_MAKE_KEY ExpectIntEQ(wc_ed25519_make_key(&rng, ED25519_KEY_SIZE, &key), 0); +#else + ExpectIntEQ(wc_ed25519_import_private_key_ex(privKey, sizeof(privKey), + pubKey, sizeof(pubKey), &key, 1), 0); +#endif ExpectIntEQ(wc_ed25519_export_private(&key, privOnly, &privOnlySz), 0); /* Test bad args. 
*/ diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c index d90a6e0ef9..4bea547e2e 100644 --- a/wolfcrypt/benchmark/benchmark.c +++ b/wolfcrypt/benchmark/benchmark.c @@ -8922,6 +8922,7 @@ void bench_curve25519KeyAgree(int useDeviceID) #ifdef HAVE_ED25519 void bench_ed25519KeyGen(void) { +#ifdef HAVE_ED25519_MAKE_KEY ed25519_key genKey; double start; int i, count; @@ -8938,12 +8939,15 @@ void bench_ed25519KeyGen(void) count += i; } while (bench_stats_check(start)); bench_stats_asym_finish("ED", 25519, desc[2], 0, count, start, 0); +#endif /* HAVE_ED25519_MAKE_KEY */ } void bench_ed25519KeySign(void) { +#ifdef HAVE_ED25519_MAKE_KEY int ret; +#endif ed25519_key genKey; #ifdef HAVE_ED25519_SIGN double start; @@ -8956,11 +8960,13 @@ void bench_ed25519KeySign(void) wc_ed25519_init(&genKey); +#ifdef HAVE_ED25519_MAKE_KEY ret = wc_ed25519_make_key(&gRng, ED25519_KEY_SIZE, &genKey); if (ret != 0) { printf("ed25519_make_key failed\n"); return; } +#endif #ifdef HAVE_ED25519_SIGN /* make dummy msg */ diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index 17da9d652b..a615488f15 100644 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -97,7 +97,7 @@ block cipher mechanism that uses n-bit binary string parameter key with 128-bits #include #endif -#ifndef WOLFSSL_ARMASM +#if !defined(WOLFSSL_ARMASM) || defined(WOLFSSL_ARMASM_NO_NEON) #ifdef WOLFSSL_IMX6_CAAM_BLOB /* case of possibly not using hardware acceleration for AES but using key @@ -4573,7 +4573,7 @@ int wc_AesSetIV(Aes* aes, const byte* iv) #endif /* NEED_AES_CTR_SOFT */ #endif /* WOLFSSL_AES_COUNTER */ -#endif /* !WOLFSSL_ARMASM */ +#endif /* !WOLFSSL_ARMASM || WOLFSSL_ARMASM_NO_NEON */ /* @@ -4620,7 +4620,7 @@ static WC_INLINE void IncCtr(byte* ctr, word32 ctrSz) #endif -#ifdef WOLFSSL_ARMASM +#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_NO_NEON) /* implementation is located in wolfcrypt/src/port/arm/armv8-aes.c */ #elif defined(WOLFSSL_AFALG) @@ -8851,7 +8851,7 @@ int wc_AesCcmCheckTagSize(int sz) return 0; } -#ifdef WOLFSSL_ARMASM +#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_NO_NEON) /* implementation located in wolfcrypt/src/port/arm/armv8-aes.c */ #elif defined(HAVE_COLDFIRE_SEC) diff --git a/wolfcrypt/src/chacha.c b/wolfcrypt/src/chacha.c index 91e6bf0e7d..07071324e9 100644 --- a/wolfcrypt/src/chacha.c +++ b/wolfcrypt/src/chacha.c @@ -28,7 +28,7 @@ D. J. Bernstein Public domain. */ -#ifdef WOLFSSL_ARMASM +#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_NO_NEON) /* implementation is located in wolfcrypt/src/port/arm/armv8-chacha.c */ #else @@ -38,7 +38,7 @@ Public domain. 
#include -#if defined(HAVE_CHACHA) && !defined(WOLFSSL_ARMASM) +#if defined(HAVE_CHACHA) #include #include @@ -436,6 +436,6 @@ void wc_Chacha_purge_current_block(ChaCha* ctx) { } } -#endif /* HAVE_CHACHA*/ +#endif /* HAVE_CHACHA */ -#endif /* WOLFSSL_ARMASM */ +#endif /* WOLFSSL_ARMASM && !WOLFSSL_ARMASM_NO_NEON */ diff --git a/wolfcrypt/src/curve25519.c b/wolfcrypt/src/curve25519.c index cf41e027c5..fea0105993 100644 --- a/wolfcrypt/src/curve25519.c +++ b/wolfcrypt/src/curve25519.c @@ -325,14 +325,11 @@ int wc_curve25519_shared_secret_ex(curve25519_key* private_key, } } #endif - if (ret != 0) { - ForceZero(&o, sizeof(o)); - return ret; + if (ret == 0) { + curve25519_copy_point(out, o.point, endian); + *outlen = CURVE25519_KEYSIZE; } - curve25519_copy_point(out, o.point, endian); - *outlen = CURVE25519_KEYSIZE; - ForceZero(&o, sizeof(o)); return ret; diff --git a/wolfcrypt/src/ecc.c b/wolfcrypt/src/ecc.c index 3fa15ab3fa..11c51fd4d4 100644 --- a/wolfcrypt/src/ecc.c +++ b/wolfcrypt/src/ecc.c @@ -3061,6 +3061,7 @@ static int ecc_mulmod(const mp_int* k, ecc_point* P, ecc_point* Q, #endif int infinity; +#ifndef WC_NO_CACHE_RESISTANT #ifdef WOLFSSL_SMALL_STACK tmp = (mp_int*)XMALLOC(sizeof(mp_int), NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { @@ -3069,6 +3070,7 @@ static int ecc_mulmod(const mp_int* k, ecc_point* P, ecc_point* Q, #endif if (err == MP_OKAY) err = mp_init(tmp); +#endif /* Step 1: R[0] = P; R[1] = P */ /* R[0] = P */ @@ -3217,7 +3219,7 @@ static int ecc_mulmod(const mp_int* k, ecc_point* P, ecc_point* Q, if (err == MP_OKAY) err = mp_copy(R[0]->z, Q->z); -#ifdef WOLFSSL_SMALL_STACK +#if defined(WOLFSSL_SMALL_STACK) && !defined(WC_NO_CACHE_RESISTANT) XFREE(tmp, NULL, DYNAMIC_TYPE_ECC); #endif diff --git a/wolfcrypt/src/ed25519.c b/wolfcrypt/src/ed25519.c index 3b9988bc3f..aa82590d41 100644 --- a/wolfcrypt/src/ed25519.c +++ b/wolfcrypt/src/ed25519.c @@ -182,6 +182,7 @@ static int ed25519_hash(ed25519_key* key, const byte* in, word32 inLen, return ret; } +#ifdef HAVE_ED25519_MAKE_KEY int wc_ed25519_make_public(ed25519_key* key, unsigned char* pubKey, word32 pubKeySz) { @@ -267,6 +268,7 @@ int wc_ed25519_make_key(WC_RNG* rng, int keySz, ed25519_key* key) return ret; } +#endif /* HAVE_ED25519_MAKE_KEY */ #ifdef HAVE_ED25519_SIGN @@ -1236,6 +1238,7 @@ int wc_ed25519_export_key(ed25519_key* key, int wc_ed25519_check_key(ed25519_key* key) { int ret = 0; +#ifdef HAVE_ED25519_MAKE_KEY unsigned char pubKey[ED25519_PUB_KEY_SIZE]; if (!key->pubKeySet) @@ -1244,6 +1247,9 @@ int wc_ed25519_check_key(ed25519_key* key) ret = wc_ed25519_make_public(key, pubKey, sizeof(pubKey)); if (ret == 0 && XMEMCMP(pubKey, key->p, ED25519_PUB_KEY_SIZE) != 0) ret = PUBLIC_KEY_E; +#else + (void)key; +#endif /* HAVE_ED25519_MAKE_KEY */ return ret; } diff --git a/wolfcrypt/src/fe_x25519_asm.S b/wolfcrypt/src/fe_x25519_asm.S index 2cc2c5fdeb..2f07e3a116 100644 --- a/wolfcrypt/src/fe_x25519_asm.S +++ b/wolfcrypt/src/fe_x25519_asm.S @@ -115,17 +115,6 @@ L_fe_init_get_flags: #else movq %rax, _fe_mul121666_p(%rip) #endif /* __APPLE__ */ -#ifndef __APPLE__ - movq fe_sq2_avx2@GOTPCREL(%rip), %rax -#else - leaq _fe_sq2_avx2(%rip), %rax -#endif /* __APPLE__ */ -#ifndef __APPLE__ - movq fe_sq2_p@GOTPCREL(%rip), %rdx - movq %rax, (%rdx) -#else - movq %rax, _fe_sq2_p(%rip) -#endif /* __APPLE__ */ #ifndef __APPLE__ movq fe_invert_avx2@GOTPCREL(%rip), %rax #else @@ -148,6 +137,18 @@ L_fe_init_get_flags: #else movq %rax, _curve25519_p(%rip) #endif /* __APPLE__ */ +#ifdef HAVE_ED25519 +#ifndef __APPLE__ + movq 
fe_sq2_avx2@GOTPCREL(%rip), %rax +#else + leaq _fe_sq2_avx2(%rip), %rax +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_sq2_p@GOTPCREL(%rip), %rdx + movq %rax, (%rdx) +#else + movq %rax, _fe_sq2_p(%rip) +#endif /* __APPLE__ */ #ifndef __APPLE__ movq fe_pow22523_avx2@GOTPCREL(%rip), %rax #else @@ -160,82 +161,105 @@ L_fe_init_get_flags: movq %rax, _fe_pow22523_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ - movq fe_ge_to_p2_avx2@GOTPCREL(%rip), %rax + movq ge_p1p1_to_p2_avx2@GOTPCREL(%rip), %rax +#else + leaq _ge_p1p1_to_p2_avx2(%rip), %rax +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq ge_p1p1_to_p2_p@GOTPCREL(%rip), %rdx + movq %rax, (%rdx) +#else + movq %rax, _ge_p1p1_to_p2_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq ge_p1p1_to_p3_avx2@GOTPCREL(%rip), %rax +#else + leaq _ge_p1p1_to_p3_avx2(%rip), %rax +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq ge_p1p1_to_p3_p@GOTPCREL(%rip), %rdx + movq %rax, (%rdx) +#else + movq %rax, _ge_p1p1_to_p3_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq ge_p2_dbl_avx2@GOTPCREL(%rip), %rax #else - leaq _fe_ge_to_p2_avx2(%rip), %rax + leaq _ge_p2_dbl_avx2(%rip), %rax #endif /* __APPLE__ */ #ifndef __APPLE__ - movq fe_ge_to_p2_p@GOTPCREL(%rip), %rdx + movq ge_p2_dbl_p@GOTPCREL(%rip), %rdx movq %rax, (%rdx) #else - movq %rax, _fe_ge_to_p2_p(%rip) + movq %rax, _ge_p2_dbl_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ - movq fe_ge_to_p3_avx2@GOTPCREL(%rip), %rax + movq ge_madd_avx2@GOTPCREL(%rip), %rax #else - leaq _fe_ge_to_p3_avx2(%rip), %rax + leaq _ge_madd_avx2(%rip), %rax #endif /* __APPLE__ */ #ifndef __APPLE__ - movq fe_ge_to_p3_p@GOTPCREL(%rip), %rdx + movq ge_madd_p@GOTPCREL(%rip), %rdx movq %rax, (%rdx) #else - movq %rax, _fe_ge_to_p3_p(%rip) + movq %rax, _ge_madd_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ - movq fe_ge_dbl_avx2@GOTPCREL(%rip), %rax + movq ge_msub_avx2@GOTPCREL(%rip), %rax #else - leaq _fe_ge_dbl_avx2(%rip), %rax + leaq _ge_msub_avx2(%rip), %rax #endif /* __APPLE__ */ #ifndef __APPLE__ - movq fe_ge_dbl_p@GOTPCREL(%rip), %rdx + movq ge_msub_p@GOTPCREL(%rip), %rdx movq %rax, (%rdx) #else - movq %rax, _fe_ge_dbl_p(%rip) + movq %rax, _ge_msub_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ - movq fe_ge_madd_avx2@GOTPCREL(%rip), %rax + movq ge_add_avx2@GOTPCREL(%rip), %rax #else - leaq _fe_ge_madd_avx2(%rip), %rax + leaq _ge_add_avx2(%rip), %rax #endif /* __APPLE__ */ #ifndef __APPLE__ - movq fe_ge_madd_p@GOTPCREL(%rip), %rdx + movq ge_add_p@GOTPCREL(%rip), %rdx movq %rax, (%rdx) #else - movq %rax, _fe_ge_madd_p(%rip) + movq %rax, _ge_add_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ - movq fe_ge_msub_avx2@GOTPCREL(%rip), %rax + movq ge_sub_avx2@GOTPCREL(%rip), %rax #else - leaq _fe_ge_msub_avx2(%rip), %rax + leaq _ge_sub_avx2(%rip), %rax #endif /* __APPLE__ */ #ifndef __APPLE__ - movq fe_ge_msub_p@GOTPCREL(%rip), %rdx + movq ge_sub_p@GOTPCREL(%rip), %rdx movq %rax, (%rdx) #else - movq %rax, _fe_ge_msub_p(%rip) + movq %rax, _ge_sub_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ - movq fe_ge_add_avx2@GOTPCREL(%rip), %rax + movq sc_reduce_avx2@GOTPCREL(%rip), %rax #else - leaq _fe_ge_add_avx2(%rip), %rax + leaq _sc_reduce_avx2(%rip), %rax #endif /* __APPLE__ */ #ifndef __APPLE__ - movq fe_ge_add_p@GOTPCREL(%rip), %rdx + movq sc_reduce_p@GOTPCREL(%rip), %rdx movq %rax, (%rdx) #else - movq %rax, _fe_ge_add_p(%rip) + movq %rax, _sc_reduce_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ - movq fe_ge_sub_avx2@GOTPCREL(%rip), %rax + movq sc_muladd_avx2@GOTPCREL(%rip), %rax #else - leaq 
_fe_ge_sub_avx2(%rip), %rax + leaq _sc_muladd_avx2(%rip), %rax #endif /* __APPLE__ */ #ifndef __APPLE__ - movq fe_ge_sub_p@GOTPCREL(%rip), %rdx + movq sc_muladd_p@GOTPCREL(%rip), %rdx movq %rax, (%rdx) #else - movq %rax, _fe_ge_sub_p(%rip) + movq %rax, _sc_muladd_p(%rip) #endif /* __APPLE__ */ +#endif /* HAVE_ED25519 */ L_fe_init_flags_done: #ifndef __APPLE__ movq cpuFlagsSet@GOTPCREL(%rip), %rdx @@ -400,21 +424,19 @@ _fe_sub: movq 16(%rsi), %r8 movq 24(%rsi), %r9 subq (%rdx), %rax - movq $0x00, %r10 sbbq 8(%rdx), %rcx - movq $-19, %r11 sbbq 16(%rdx), %r8 - movq $0x7fffffffffffffff, %r12 sbbq 24(%rdx), %r9 - sbbq $0x00, %r10 - # Mask the modulus - andq %r10, %r11 - andq %r10, %r12 + sbbq %r11, %r11 + shldq $0x01, %r9, %r11 + movq $0x7fffffffffffffff, %r12 + imulq $-19, %r11 + andq %r12, %r9 # Add modulus (if underflow) - addq %r11, %rax - adcq %r10, %rcx - adcq %r10, %r8 - adcq %r12, %r9 + subq %r11, %rax + sbbq $0x00, %rcx + sbbq $0x00, %r8 + sbbq $0x00, %r9 movq %rax, (%rdi) movq %rcx, 8(%rdi) movq %r8, 16(%rdi) @@ -443,21 +465,20 @@ _fe_add: addq (%rdx), %rax movq 16(%rsi), %r8 adcq 8(%rdx), %rcx - movq 24(%rsi), %r10 + movq 24(%rsi), %r9 adcq 16(%rdx), %r8 - movq $-19, %r11 - adcq 24(%rdx), %r10 + adcq 24(%rdx), %r9 + movq $0x00, %r11 + adcq $0x00, %r11 + shldq $0x01, %r9, %r11 movq $0x7fffffffffffffff, %r12 - movq %r10, %r9 - sarq $63, %r10 - # Mask the modulus - andq %r10, %r11 - andq %r10, %r12 + imulq $19, %r11 + andq %r12, %r9 # Sub modulus (if overflow) - subq %r11, %rax - sbbq %r10, %rcx - sbbq %r10, %r8 - sbbq %r12, %r9 + addq %r11, %rax + adcq $0x00, %rcx + adcq $0x00, %r8 + adcq $0x00, %r9 movq %rax, (%rdi) movq %rcx, 8(%rdi) movq %r8, 16(%rdi) @@ -943,26 +964,6 @@ _fe_mul121666: #endif /* __APPLE__ */ #ifndef __APPLE__ .text -.globl fe_sq2 -.type fe_sq2,@function -.align 16 -fe_sq2: -#else -.section __TEXT,__text -.globl _fe_sq2 -.p2align 4 -_fe_sq2: -#endif /* __APPLE__ */ -#ifndef __APPLE__ - jmpq *fe_sq2_p(%rip) -#else - jmpq *_fe_sq2_p(%rip) -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.size fe_sq2,.-fe_sq2 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text .globl fe_invert .type fe_invert,@function .align 16 @@ -1001,6 +1002,30 @@ _curve25519: #ifndef __APPLE__ .size curve25519,.-curve25519 #endif /* __APPLE__ */ +#ifdef HAVE_ED25519 +#ifdef HAVE_ED25519 +#ifndef __APPLE__ +.text +.globl fe_sq2 +.type fe_sq2,@function +.align 16 +fe_sq2: +#else +.section __TEXT,__text +.globl _fe_sq2 +.p2align 4 +_fe_sq2: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *fe_sq2_p(%rip) +#else + jmpq *_fe_sq2_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size fe_sq2,.-fe_sq2 +#endif /* __APPLE__ */ +#endif /* HAVE_ED25519 */ +#ifdef HAVE_ED25519 #ifndef __APPLE__ .text .globl fe_pow22523 @@ -1021,146 +1046,206 @@ _fe_pow22523: #ifndef __APPLE__ .size fe_pow22523,.-fe_pow22523 #endif /* __APPLE__ */ +#endif /* HAVE_ED25519 */ +#ifdef HAVE_ED25519 +#ifndef __APPLE__ +.text +.globl ge_p1p1_to_p2 +.type ge_p1p1_to_p2,@function +.align 16 +ge_p1p1_to_p2: +#else +.section __TEXT,__text +.globl _ge_p1p1_to_p2 +.p2align 4 +_ge_p1p1_to_p2: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *ge_p1p1_to_p2_p(%rip) +#else + jmpq *_ge_p1p1_to_p2_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size ge_p1p1_to_p2,.-ge_p1p1_to_p2 +#endif /* __APPLE__ */ +#endif /* HAVE_ED25519 */ +#ifdef HAVE_ED25519 +#ifndef __APPLE__ +.text +.globl ge_p1p1_to_p3 +.type ge_p1p1_to_p3,@function +.align 16 +ge_p1p1_to_p3: +#else +.section __TEXT,__text +.globl _ge_p1p1_to_p3 +.p2align 4 +_ge_p1p1_to_p3: +#endif 
/* __APPLE__ */ +#ifndef __APPLE__ + jmpq *ge_p1p1_to_p3_p(%rip) +#else + jmpq *_ge_p1p1_to_p3_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size ge_p1p1_to_p3,.-ge_p1p1_to_p3 +#endif /* __APPLE__ */ +#endif /* HAVE_ED25519 */ +#ifdef HAVE_ED25519 #ifndef __APPLE__ .text -.globl fe_ge_to_p2 -.type fe_ge_to_p2,@function +.globl ge_p2_dbl +.type ge_p2_dbl,@function .align 16 -fe_ge_to_p2: +ge_p2_dbl: #else .section __TEXT,__text -.globl _fe_ge_to_p2 +.globl _ge_p2_dbl .p2align 4 -_fe_ge_to_p2: +_ge_p2_dbl: #endif /* __APPLE__ */ #ifndef __APPLE__ - jmpq *fe_ge_to_p2_p(%rip) + jmpq *ge_p2_dbl_p(%rip) #else - jmpq *_fe_ge_to_p2_p(%rip) + jmpq *_ge_p2_dbl_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ -.size fe_ge_to_p2,.-fe_ge_to_p2 +.size ge_p2_dbl,.-ge_p2_dbl #endif /* __APPLE__ */ +#endif /* HAVE_ED25519 */ +#ifdef HAVE_ED25519 #ifndef __APPLE__ .text -.globl fe_ge_to_p3 -.type fe_ge_to_p3,@function +.globl ge_madd +.type ge_madd,@function .align 16 -fe_ge_to_p3: +ge_madd: #else .section __TEXT,__text -.globl _fe_ge_to_p3 +.globl _ge_madd .p2align 4 -_fe_ge_to_p3: +_ge_madd: #endif /* __APPLE__ */ #ifndef __APPLE__ - jmpq *fe_ge_to_p3_p(%rip) + jmpq *ge_madd_p(%rip) #else - jmpq *_fe_ge_to_p3_p(%rip) + jmpq *_ge_madd_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ -.size fe_ge_to_p3,.-fe_ge_to_p3 +.size ge_madd,.-ge_madd #endif /* __APPLE__ */ +#endif /* HAVE_ED25519 */ +#ifdef HAVE_ED25519 #ifndef __APPLE__ .text -.globl fe_ge_dbl -.type fe_ge_dbl,@function +.globl ge_msub +.type ge_msub,@function .align 16 -fe_ge_dbl: +ge_msub: #else .section __TEXT,__text -.globl _fe_ge_dbl +.globl _ge_msub .p2align 4 -_fe_ge_dbl: +_ge_msub: #endif /* __APPLE__ */ #ifndef __APPLE__ - jmpq *fe_ge_dbl_p(%rip) + jmpq *ge_msub_p(%rip) #else - jmpq *_fe_ge_dbl_p(%rip) + jmpq *_ge_msub_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ -.size fe_ge_dbl,.-fe_ge_dbl +.size ge_msub,.-ge_msub #endif /* __APPLE__ */ +#endif /* HAVE_ED25519 */ +#ifdef HAVE_ED25519 #ifndef __APPLE__ .text -.globl fe_ge_madd -.type fe_ge_madd,@function +.globl ge_add +.type ge_add,@function .align 16 -fe_ge_madd: +ge_add: #else .section __TEXT,__text -.globl _fe_ge_madd +.globl _ge_add .p2align 4 -_fe_ge_madd: +_ge_add: #endif /* __APPLE__ */ #ifndef __APPLE__ - jmpq *fe_ge_madd_p(%rip) + jmpq *ge_add_p(%rip) #else - jmpq *_fe_ge_madd_p(%rip) + jmpq *_ge_add_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ -.size fe_ge_madd,.-fe_ge_madd +.size ge_add,.-ge_add #endif /* __APPLE__ */ +#endif /* HAVE_ED25519 */ +#ifdef HAVE_ED25519 #ifndef __APPLE__ .text -.globl fe_ge_msub -.type fe_ge_msub,@function +.globl ge_sub +.type ge_sub,@function .align 16 -fe_ge_msub: +ge_sub: #else .section __TEXT,__text -.globl _fe_ge_msub +.globl _ge_sub .p2align 4 -_fe_ge_msub: +_ge_sub: #endif /* __APPLE__ */ #ifndef __APPLE__ - jmpq *fe_ge_msub_p(%rip) + jmpq *ge_sub_p(%rip) #else - jmpq *_fe_ge_msub_p(%rip) + jmpq *_ge_sub_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ -.size fe_ge_msub,.-fe_ge_msub +.size ge_sub,.-ge_sub #endif /* __APPLE__ */ +#endif /* HAVE_ED25519 */ +#ifdef HAVE_ED25519 #ifndef __APPLE__ .text -.globl fe_ge_add -.type fe_ge_add,@function +.globl sc_reduce +.type sc_reduce,@function .align 16 -fe_ge_add: +sc_reduce: #else .section __TEXT,__text -.globl _fe_ge_add +.globl _sc_reduce .p2align 4 -_fe_ge_add: +_sc_reduce: #endif /* __APPLE__ */ #ifndef __APPLE__ - jmpq *fe_ge_add_p(%rip) + jmpq *sc_reduce_p(%rip) #else - jmpq *_fe_ge_add_p(%rip) + jmpq *_sc_reduce_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ -.size 
fe_ge_add,.-fe_ge_add +.size sc_reduce,.-sc_reduce #endif /* __APPLE__ */ +#endif /* HAVE_ED25519 */ +#ifdef HAVE_ED25519 #ifndef __APPLE__ .text -.globl fe_ge_sub -.type fe_ge_sub,@function +.globl sc_muladd +.type sc_muladd,@function .align 16 -fe_ge_sub: +sc_muladd: #else .section __TEXT,__text -.globl _fe_ge_sub +.globl _sc_muladd .p2align 4 -_fe_ge_sub: +_sc_muladd: #endif /* __APPLE__ */ #ifndef __APPLE__ - jmpq *fe_ge_sub_p(%rip) + jmpq *sc_muladd_p(%rip) #else - jmpq *_fe_ge_sub_p(%rip) + jmpq *_sc_muladd_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ -.size fe_ge_sub,.-fe_ge_sub +.size sc_muladd,.-sc_muladd #endif /* __APPLE__ */ +#endif /* HAVE_ED25519 */ +#endif /* HAVE_ED25519 */ #ifndef __APPLE__ .data .type cpuFlagsSet, @object @@ -1223,18 +1308,6 @@ _fe_mul121666_p: #endif /* __APPLE__ */ #ifndef __APPLE__ .data -.type fe_sq2_p, @object -.size fe_sq2_p,8 -fe_sq2_p: - .quad fe_sq2_x64 -#else -.section __DATA,__data -.p2align 2 -_fe_sq2_p: - .quad _fe_sq2_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.data .type fe_invert_p, @object .size fe_invert_p,8 fe_invert_p: @@ -1257,6 +1330,19 @@ curve25519_p: _curve25519_p: .quad _curve25519_x64 #endif /* __APPLE__ */ +#ifdef HAVE_ED25519 +#ifndef __APPLE__ +.data +.type fe_sq2_p, @object +.size fe_sq2_p,8 +fe_sq2_p: + .quad fe_sq2_x64 +#else +.section __DATA,__data +.p2align 2 +_fe_sq2_p: + .quad _fe_sq2_x64 +#endif /* __APPLE__ */ #ifndef __APPLE__ .data .type fe_pow22523_p, @object @@ -1271,89 +1357,114 @@ _fe_pow22523_p: #endif /* __APPLE__ */ #ifndef __APPLE__ .data -.type fe_ge_to_p2_p, @object -.size fe_ge_to_p2_p,8 -fe_ge_to_p2_p: - .quad fe_ge_to_p2_x64 +.type ge_p1p1_to_p2_p, @object +.size ge_p1p1_to_p2_p,8 +ge_p1p1_to_p2_p: + .quad ge_p1p1_to_p2_x64 +#else +.section __DATA,__data +.p2align 2 +_ge_p1p1_to_p2_p: + .quad _ge_p1p1_to_p2_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +.type ge_p1p1_to_p3_p, @object +.size ge_p1p1_to_p3_p,8 +ge_p1p1_to_p3_p: + .quad ge_p1p1_to_p3_x64 #else .section __DATA,__data .p2align 2 -_fe_ge_to_p2_p: - .quad _fe_ge_to_p2_x64 +_ge_p1p1_to_p3_p: + .quad _ge_p1p1_to_p3_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .data -.type fe_ge_to_p3_p, @object -.size fe_ge_to_p3_p,8 -fe_ge_to_p3_p: - .quad fe_ge_to_p3_x64 +.type ge_p2_dbl_p, @object +.size ge_p2_dbl_p,8 +ge_p2_dbl_p: + .quad ge_p2_dbl_x64 #else .section __DATA,__data .p2align 2 -_fe_ge_to_p3_p: - .quad _fe_ge_to_p3_x64 +_ge_p2_dbl_p: + .quad _ge_p2_dbl_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .data -.type fe_ge_dbl_p, @object -.size fe_ge_dbl_p,8 -fe_ge_dbl_p: - .quad fe_ge_dbl_x64 +.type ge_madd_p, @object +.size ge_madd_p,8 +ge_madd_p: + .quad ge_madd_x64 #else .section __DATA,__data .p2align 2 -_fe_ge_dbl_p: - .quad _fe_ge_dbl_x64 +_ge_madd_p: + .quad _ge_madd_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .data -.type fe_ge_madd_p, @object -.size fe_ge_madd_p,8 -fe_ge_madd_p: - .quad fe_ge_madd_x64 +.type ge_msub_p, @object +.size ge_msub_p,8 +ge_msub_p: + .quad ge_msub_x64 #else .section __DATA,__data .p2align 2 -_fe_ge_madd_p: - .quad _fe_ge_madd_x64 +_ge_msub_p: + .quad _ge_msub_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .data -.type fe_ge_msub_p, @object -.size fe_ge_msub_p,8 -fe_ge_msub_p: - .quad fe_ge_msub_x64 +.type ge_add_p, @object +.size ge_add_p,8 +ge_add_p: + .quad ge_add_x64 #else .section __DATA,__data .p2align 2 -_fe_ge_msub_p: - .quad _fe_ge_msub_x64 +_ge_add_p: + .quad _ge_add_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .data -.type fe_ge_add_p, @object -.size fe_ge_add_p,8 -fe_ge_add_p: - .quad 
fe_ge_add_x64 +.type ge_sub_p, @object +.size ge_sub_p,8 +ge_sub_p: + .quad ge_sub_x64 #else .section __DATA,__data .p2align 2 -_fe_ge_add_p: - .quad _fe_ge_add_x64 +_ge_sub_p: + .quad _ge_sub_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .data -.type fe_ge_sub_p, @object -.size fe_ge_sub_p,8 -fe_ge_sub_p: - .quad fe_ge_sub_x64 +.type sc_reduce_p, @object +.size sc_reduce_p,8 +sc_reduce_p: + .quad sc_reduce_x64 #else .section __DATA,__data .p2align 2 -_fe_ge_sub_p: - .quad _fe_ge_sub_x64 +_sc_reduce_p: + .quad _sc_reduce_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ +.data +.type sc_muladd_p, @object +.size sc_muladd_p,8 +sc_muladd_p: + .quad sc_muladd_x64 +#else +.section __DATA,__data +.p2align 2 +_sc_muladd_p: + .quad _sc_muladd_x64 +#endif /* __APPLE__ */ +#endif /* HAVE_ED25519 */ +#ifndef __APPLE__ .text .globl fe_mul_x64 .type fe_mul_x64,@function @@ -1470,55 +1581,43 @@ _fe_mul_x64: mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 - # Reduce + movq $38, %rax + mulq %r15 + addq %rax, %r11 + adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rdx andq %rbx, %r11 - # Multiply top half by 19 - movq $19, %rax + movq %rdx, %rbx + movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 - movq $19, %rax + movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 + addq %rbx, %r8 + adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax + movq $0x7fffffffffffffff, %rbx + movq %r11, %rax + sarq $63, %rax + andq $19, %rax andq %rbx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rbx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) @@ -1621,55 +1720,43 @@ _fe_sq_x64: addq %r15, %r12 adcq $0x00, %r13 adcq $0x00, %r14 - # Reduce + movq $38, %rax + mulq %r14 + addq %rax, %r10 + adcq $0x00, %rdx movq $0x7fffffffffffffff, %r15 - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - shldq $0x01, %r10, %r11 + shldq $0x01, %r10, %rdx + imulq $19, %rdx, %rdx andq %r15, %r10 - # Multiply top half by 19 - movq $19, %rax + movq %rdx, %r15 + movq $38, %rax mulq %r11 xorq %r11, %r11 addq %rax, %rcx - movq $19, %rax + movq $38, %rax adcq %rdx, %r11 mulq %r12 xorq %r12, %r12 addq %rax, %r8 - movq $19, %rax + movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax adcq %rdx, %r13 - mulq %r14 - # Add remaining product results in - addq %r11, %r8 + addq %r15, %rcx + adcq %r11, %r8 adcq %r12, %r9 adcq %r13, %r10 - adcq %rax, %r10 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r10, %rdx - imulq $19, %rdx, %rax + movq $0x7fffffffffffffff, %r15 + movq %r10, %rax + sarq $63, %rax + andq $19, %rax andq %r15, %r10 addq %rax, %rcx adcq $0x00, %r8 adcq $0x00, %r9 adcq $0x00, %r10 - # Reduce if top bit set - movq %r10, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %r15, %r10 - addq %rdx, %rcx - adcq $0x00, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 # 
Store movq %rcx, (%rdi) movq %r8, 8(%rdi) @@ -1774,55 +1861,34 @@ L_fe_sq_n_x64: addq %rbx, %r13 adcq $0x00, %r14 adcq $0x00, %r15 - # Reduce + movq $38, %rax + mulq %r15 + addq %rax, %r11 + adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rdx andq %rbx, %r11 - # Multiply top half by 19 - movq $19, %rax + movq %rdx, %rbx + movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 - movq $19, %rax + movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 + addq %rbx, %r8 + adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rbx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) @@ -1892,178 +1958,15 @@ _fe_mul121666_x64: #endif /* __APPLE__ */ #ifndef __APPLE__ .text -.globl fe_sq2_x64 -.type fe_sq2_x64,@function +.globl fe_invert_x64 +.type fe_invert_x64,@function .align 16 -fe_sq2_x64: +fe_invert_x64: #else .section __TEXT,__text -.globl _fe_sq2_x64 +.globl _fe_invert_x64 .p2align 4 -_fe_sq2_x64: -#endif /* __APPLE__ */ - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbx - # Square * 2 - # A[0] * A[1] - movq (%rsi), %rax - mulq 8(%rsi) - movq %rax, %r8 - movq %rdx, %r9 - # A[0] * A[2] - movq (%rsi), %rax - mulq 16(%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[0] * A[3] - movq (%rsi), %rax - mulq 24(%rsi) - xorq %r11, %r11 - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * A[2] - movq 8(%rsi), %rax - mulq 16(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[1] * A[3] - movq 8(%rsi), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - # A[2] * A[3] - movq 16(%rsi), %rax - mulq 24(%rsi) - xorq %r13, %r13 - addq %rax, %r12 - adcq %rdx, %r13 - # Double - xorq %r14, %r14 - addq %r8, %r8 - adcq %r9, %r9 - adcq %r10, %r10 - adcq %r11, %r11 - adcq %r12, %r12 - adcq %r13, %r13 - adcq $0x00, %r14 - # A[0] * A[0] - movq (%rsi), %rax - mulq %rax - movq %rax, %rcx - movq %rdx, %r15 - # A[1] * A[1] - movq 8(%rsi), %rax - mulq %rax - addq %r15, %r8 - adcq %rax, %r9 - adcq $0x00, %rdx - movq %rdx, %r15 - # A[2] * A[2] - movq 16(%rsi), %rax - mulq %rax - addq %r15, %r10 - adcq %rax, %r11 - adcq $0x00, %rdx - movq %rdx, %r15 - # A[3] * A[3] - movq 24(%rsi), %rax - mulq %rax - addq %rax, %r13 - adcq %rdx, %r14 - addq %r15, %r12 - adcq $0x00, %r13 - adcq $0x00, %r14 - # Reduce - movq $0x7fffffffffffffff, %rbx - xorq %rax, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $3, %r14, %rax - shldq $2, %r13, %r14 - shldq $2, %r12, %r13 - shldq $2, %r11, %r12 - shldq $2, %r10, %r11 - shldq $0x01, %r9, %r10 - shldq $0x01, %r8, %r9 - shldq $0x01, %rcx, %r8 - shlq $0x01, %rcx - andq %rbx, %r10 - # Two out left, one in right - andq %rbx, %r14 - # Multiply top bits by 19*19 - imulq $0x169, %rax, %r15 - # Multiply top half by 19 - movq $19, %rax - mulq %r11 - xorq %r11, %r11 - addq %rax, %rcx - 
movq $19, %rax - adcq %rdx, %r11 - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - # Add remaining produce results in - addq %r15, %rcx - adcq %r11, %r8 - adcq %r12, %r9 - adcq %r13, %r10 - adcq %rax, %r10 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r10, %rdx - imulq $19, %rdx, %rax - andq %rbx, %r10 - addq %rax, %rcx - adcq $0x00, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - # Reduce if top bit set - movq %r10, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rbx, %r10 - addq %rdx, %rcx - adcq $0x00, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - # Store - movq %rcx, (%rdi) - movq %r8, 8(%rdi) - movq %r9, 16(%rdi) - movq %r10, 24(%rdi) - popq %rbx - popq %r15 - popq %r14 - popq %r13 - popq %r12 - repz retq -#ifndef __APPLE__ -.size fe_sq2_x64,.-fe_sq2_x64 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_invert_x64 -.type fe_invert_x64,@function -.align 16 -fe_invert_x64: -#else -.section __TEXT,__text -.globl _fe_invert_x64 -.p2align 4 -_fe_invert_x64: +_fe_invert_x64: #endif /* __APPLE__ */ subq $0x90, %rsp # Invert @@ -2328,9 +2231,9 @@ _curve25519_x64: pushq %rbx pushq %rbp movq %rdx, %r8 - subq $0xb8, %rsp + subq $0xb0, %rsp xorq %rbx, %rbx - movq %rdi, 176(%rsp) + movq %rdi, 168(%rsp) # Set one movq $0x01, (%rdi) movq $0x00, 8(%rdi) @@ -2355,12 +2258,12 @@ _curve25519_x64: movq %r9, 72(%rsp) movq %r10, 80(%rsp) movq %r11, 88(%rsp) - movb $62, 168(%rsp) - movq $3, 160(%rsp) -L_curve25519_x64_words: + movq $0xfe, %r9 L_curve25519_x64_bits: - movq 160(%rsp), %r9 - movb 168(%rsp), %cl + movq %r9, 160(%rsp) + movq %r9, %rcx + andq $63, %rcx + shrq $6, %r9 movq (%rsi,%r9,8), %rbp shrq %cl, %rbp andq $0x01, %rbp @@ -2409,48 +2312,45 @@ L_curve25519_x64_bits: xorq %r10, 48(%rsp) xorq %r11, 56(%rsp) movq %rbp, %rbx + # Add-Sub # Add movq (%rdi), %rcx movq 8(%rdi), %r9 movq 16(%rdi), %r10 - movq 24(%rdi), %rbp + movq 24(%rdi), %r11 movq %rcx, %r12 addq (%rsp), %rcx movq %r9, %r13 adcq 8(%rsp), %r9 movq %r10, %r14 adcq 16(%rsp), %r10 - movq %rbp, %r15 - adcq 24(%rsp), %rbp - movq $-19, %rax - movq %rbp, %r11 + movq %r11, %r15 + adcq 24(%rsp), %r11 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r11, %rax movq $0x7fffffffffffffff, %rdx - sarq $63, %rbp - # Mask the modulus - andq %rbp, %rax - andq %rbp, %rdx + imulq $19, %rax + andq %rdx, %r11 # Sub modulus (if overflow) - subq %rax, %rcx - sbbq %rbp, %r9 - sbbq %rbp, %r10 - sbbq %rdx, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 # Sub subq (%rsp), %r12 - movq $0x00, %rbp sbbq 8(%rsp), %r13 - movq $-19, %rax sbbq 16(%rsp), %r14 - movq $0x7fffffffffffffff, %rdx sbbq 24(%rsp), %r15 - sbbq $0x00, %rbp - # Mask the modulus - andq %rbp, %rax - andq %rbp, %rdx + sbbq %rax, %rax + shldq $0x01, %r15, %rax + imulq $-19, %rax + andq %rdx, %r15 # Add modulus (if underflow) - addq %rax, %r12 - adcq %rbp, %r13 - adcq %rbp, %r14 - adcq %rdx, %r15 + subq %rax, %r12 + sbbq $0x00, %r13 + sbbq $0x00, %r14 + sbbq $0x00, %r15 movq %rcx, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) @@ -2459,204 +2359,180 @@ L_curve25519_x64_bits: movq %r13, 136(%rsp) movq %r14, 144(%rsp) movq %r15, 152(%rsp) + # Add-Sub # Add movq 64(%rsp), %rcx movq 72(%rsp), %r9 movq 80(%rsp), %r10 - movq 88(%rsp), %rbp + movq 88(%rsp), %r11 movq %rcx, %r12 addq 32(%rsp), %rcx movq %r9, %r13 adcq 40(%rsp), %r9 movq %r10, %r14 adcq 48(%rsp), %r10 - movq %rbp, %r15 - adcq 56(%rsp), %rbp - movq $-19, %rax - movq %rbp, %r11 + 
movq %r11, %r15 + adcq 56(%rsp), %r11 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r11, %rax movq $0x7fffffffffffffff, %rdx - sarq $63, %rbp - # Mask the modulus - andq %rbp, %rax - andq %rbp, %rdx + imulq $19, %rax + andq %rdx, %r11 # Sub modulus (if overflow) - subq %rax, %rcx - sbbq %rbp, %r9 - sbbq %rbp, %r10 - sbbq %rdx, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 # Sub subq 32(%rsp), %r12 - movq $0x00, %rbp sbbq 40(%rsp), %r13 - movq $-19, %rax sbbq 48(%rsp), %r14 - movq $0x7fffffffffffffff, %rdx sbbq 56(%rsp), %r15 - sbbq $0x00, %rbp - # Mask the modulus - andq %rbp, %rax - andq %rbp, %rdx + sbbq %rax, %rax + shldq $0x01, %r15, %rax + imulq $-19, %rax + andq %rdx, %r15 # Add modulus (if underflow) - addq %rax, %r12 - adcq %rbp, %r13 - adcq %rbp, %r14 - adcq %rdx, %r15 - movq %rcx, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 16(%rsp) - movq %r11, 24(%rsp) + subq %rax, %r12 + sbbq $0x00, %r13 + sbbq $0x00, %r14 + sbbq $0x00, %r15 + movq %rcx, 32(%rsp) + movq %r9, 40(%rsp) + movq %r10, 48(%rsp) + movq %r11, 56(%rsp) movq %r12, 96(%rsp) movq %r13, 104(%rsp) movq %r14, 112(%rsp) movq %r15, 120(%rsp) # Multiply # A[0] * B[0] - movq (%rdi), %rax - mulq 96(%rsp) + movq 128(%rsp), %rax + mulq 32(%rsp) movq %rax, %rcx movq %rdx, %r9 # A[0] * B[1] - movq 8(%rdi), %rax - mulq 96(%rsp) + movq 136(%rsp), %rax + mulq 32(%rsp) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] - movq (%rdi), %rax - mulq 104(%rsp) + movq 128(%rsp), %rax + mulq 40(%rsp) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] - movq 16(%rdi), %rax - mulq 96(%rsp) + movq 144(%rsp), %rax + mulq 32(%rsp) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] - movq 8(%rdi), %rax - mulq 104(%rsp) + movq 136(%rsp), %rax + mulq 40(%rsp) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] - movq (%rdi), %rax - mulq 112(%rsp) + movq 128(%rsp), %rax + mulq 48(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] - movq 24(%rdi), %rax - mulq 96(%rsp) + movq 152(%rsp), %rax + mulq 32(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] - movq 16(%rdi), %rax - mulq 104(%rsp) + movq 144(%rsp), %rax + mulq 40(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] - movq 8(%rdi), %rax - mulq 112(%rsp) + movq 136(%rsp), %rax + mulq 48(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] - movq (%rdi), %rax - mulq 120(%rsp) + movq 128(%rsp), %rax + mulq 56(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] - movq 24(%rdi), %rax - mulq 104(%rsp) + movq 152(%rsp), %rax + mulq 40(%rsp) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] - movq 16(%rdi), %rax - mulq 112(%rsp) + movq 144(%rsp), %rax + mulq 48(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] - movq 8(%rdi), %rax - mulq 120(%rsp) + movq 136(%rsp), %rax + mulq 56(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] - movq 24(%rdi), %rax - mulq 112(%rsp) + movq 152(%rsp), %rax + mulq 48(%rsp) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] - movq 16(%rdi), %rax - mulq 120(%rsp) + movq 144(%rsp), %rax + mulq 56(%rsp) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] - movq 24(%rdi), %rax - mulq 120(%rsp) + movq 152(%rsp), %rax + mulq 56(%rsp) addq %rax, %r14 adcq %rdx, %r15 - # Reduce + movq $38, %rax + mulq %r15 + addq %rax, %r11 + adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp - # Move 
top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rdx andq %rbp, %r11 - # Multiply top half by 19 - movq $19, %rax + movq %rdx, %rbp + movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx - movq $19, %rax + movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 + addq %rbp, %rcx + adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rbp, %r11 - addq %rdx, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 # Store movq %rcx, 32(%rsp) movq %r9, 40(%rsp) @@ -2664,152 +2540,131 @@ L_curve25519_x64_bits: movq %r11, 56(%rsp) # Multiply # A[0] * B[0] - movq 128(%rsp), %rax - mulq (%rsp) + movq (%rdi), %rax + mulq 96(%rsp) movq %rax, %rcx movq %rdx, %r9 # A[0] * B[1] - movq 136(%rsp), %rax - mulq (%rsp) + movq 8(%rdi), %rax + mulq 96(%rsp) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] - movq 128(%rsp), %rax - mulq 8(%rsp) + movq (%rdi), %rax + mulq 104(%rsp) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] - movq 144(%rsp), %rax - mulq (%rsp) + movq 16(%rdi), %rax + mulq 96(%rsp) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] - movq 136(%rsp), %rax - mulq 8(%rsp) + movq 8(%rdi), %rax + mulq 104(%rsp) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] - movq 128(%rsp), %rax - mulq 16(%rsp) + movq (%rdi), %rax + mulq 112(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] - movq 152(%rsp), %rax - mulq (%rsp) + movq 24(%rdi), %rax + mulq 96(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] - movq 144(%rsp), %rax - mulq 8(%rsp) + movq 16(%rdi), %rax + mulq 104(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] - movq 136(%rsp), %rax - mulq 16(%rsp) + movq 8(%rdi), %rax + mulq 112(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] - movq 128(%rsp), %rax - mulq 24(%rsp) + movq (%rdi), %rax + mulq 120(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] - movq 152(%rsp), %rax - mulq 8(%rsp) + movq 24(%rdi), %rax + mulq 104(%rsp) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] - movq 144(%rsp), %rax - mulq 16(%rsp) + movq 16(%rdi), %rax + mulq 112(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] - movq 136(%rsp), %rax - mulq 24(%rsp) + movq 8(%rdi), %rax + mulq 120(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] - movq 152(%rsp), %rax - mulq 16(%rsp) + movq 24(%rdi), %rax + mulq 112(%rsp) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] - movq 144(%rsp), %rax - mulq 24(%rsp) + movq 16(%rdi), %rax + mulq 120(%rsp) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] - movq 152(%rsp), %rax - mulq 24(%rsp) + movq 24(%rdi), %rax + mulq 120(%rsp) addq %rax, %r14 adcq %rdx, %r15 - # Reduce + movq $38, %rax + mulq %r15 + addq %rax, %r11 + adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp - # Move top 
half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rdx andq %rbp, %r11 - # Multiply top half by 19 - movq $19, %rax + movq %rdx, %rbp + movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx - movq $19, %rax + movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 + addq %rbp, %rcx + adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rbp, %r11 - addq %rdx, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 # Store movq %rcx, (%rsp) movq %r9, 8(%rsp) @@ -2887,55 +2742,34 @@ L_curve25519_x64_bits: addq %rbp, %r13 adcq $0x00, %r14 adcq $0x00, %r15 - # Reduce + movq $38, %rax + mulq %r15 + addq %rax, %r11 + adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rdx andq %rbp, %r11 - # Multiply top half by 19 - movq $19, %rax + movq %rdx, %rbp + movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx - movq $19, %rax + movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 + addq %rbp, %rcx + adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rbp, %r11 - addq %rdx, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 # Store movq %rcx, 96(%rsp) movq %r9, 104(%rsp) @@ -3013,110 +2847,86 @@ L_curve25519_x64_bits: addq %rbp, %r13 adcq $0x00, %r14 adcq $0x00, %r15 - # Reduce + movq $38, %rax + mulq %r15 + addq %rax, %r11 + adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rdx andq %rbp, %r11 - # Multiply top half by 19 - movq $19, %rax + movq %rdx, %rbp + movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx - movq $19, %rax + movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 + addq %rbp, %rcx + adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, 
%rdx - andq $19, %rdx - andq %rbp, %r11 - addq %rdx, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 # Store movq %rcx, 128(%rsp) movq %r9, 136(%rsp) movq %r10, 144(%rsp) movq %r11, 152(%rsp) + # Add-Sub # Add - movq 32(%rsp), %rcx - movq 40(%rsp), %r9 - movq 48(%rsp), %r10 - movq 56(%rsp), %rbp + movq (%rsp), %rcx + movq 8(%rsp), %r9 + movq 16(%rsp), %r10 + movq 24(%rsp), %r11 movq %rcx, %r12 - addq (%rsp), %rcx + addq 32(%rsp), %rcx movq %r9, %r13 - adcq 8(%rsp), %r9 + adcq 40(%rsp), %r9 movq %r10, %r14 - adcq 16(%rsp), %r10 - movq %rbp, %r15 - adcq 24(%rsp), %rbp - movq $-19, %rax - movq %rbp, %r11 + adcq 48(%rsp), %r10 + movq %r11, %r15 + adcq 56(%rsp), %r11 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r11, %rax movq $0x7fffffffffffffff, %rdx - sarq $63, %rbp - # Mask the modulus - andq %rbp, %rax - andq %rbp, %rdx + imulq $19, %rax + andq %rdx, %r11 # Sub modulus (if overflow) - subq %rax, %rcx - sbbq %rbp, %r9 - sbbq %rbp, %r10 - sbbq %rdx, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 # Sub - subq (%rsp), %r12 - movq $0x00, %rbp - sbbq 8(%rsp), %r13 - movq $-19, %rax - sbbq 16(%rsp), %r14 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rsp), %r15 - sbbq $0x00, %rbp - # Mask the modulus - andq %rbp, %rax - andq %rbp, %rdx + subq 32(%rsp), %r12 + sbbq 40(%rsp), %r13 + sbbq 48(%rsp), %r14 + sbbq 56(%rsp), %r15 + sbbq %rax, %rax + shldq $0x01, %r15, %rax + imulq $-19, %rax + andq %rdx, %r15 # Add modulus (if underflow) - addq %rax, %r12 - adcq %rbp, %r13 - adcq %rbp, %r14 - adcq %rdx, %r15 + subq %rax, %r12 + sbbq $0x00, %r13 + sbbq $0x00, %r14 + sbbq $0x00, %r15 movq %rcx, 64(%rsp) movq %r9, 72(%rsp) movq %r10, 80(%rsp) movq %r11, 88(%rsp) - movq %r12, (%rsp) - movq %r13, 8(%rsp) - movq %r14, 16(%rsp) - movq %r15, 24(%rsp) + movq %r12, 32(%rsp) + movq %r13, 40(%rsp) + movq %r14, 48(%rsp) + movq %r15, 56(%rsp) # Multiply # A[0] * B[0] movq 96(%rsp), %rax @@ -3216,55 +3026,34 @@ L_curve25519_x64_bits: mulq 152(%rsp) addq %rax, %r14 adcq %rdx, %r15 - # Reduce + movq $38, %rax + mulq %r15 + addq %rax, %r11 + adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rdx andq %rbp, %r11 - # Multiply top half by 19 - movq $19, %rax + movq %rdx, %rbp + movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx - movq $19, %rax + movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 + addq %rbp, %rcx + adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rbp, %r11 - addq %rdx, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 # Store movq %rcx, (%rdi) movq %r9, 8(%rdi) @@ -3276,58 +3065,56 @@ L_curve25519_x64_bits: movq 144(%rsp), %r10 movq 152(%rsp), %r11 subq 96(%rsp), %rcx - movq $0x00, %rbp sbbq 104(%rsp), %r9 - movq $-19, %rax sbbq 112(%rsp), %r10 - movq $0x7fffffffffffffff, %rdx sbbq 120(%rsp), %r11 - sbbq $0x00, %rbp - # Mask the modulus - andq %rbp, %rax - andq 
%rbp, %rdx + sbbq %rax, %rax + shldq $0x01, %r11, %rax + movq $0x7fffffffffffffff, %rdx + imulq $-19, %rax + andq %rdx, %r11 # Add modulus (if underflow) - addq %rax, %rcx - adcq %rbp, %r9 - adcq %rbp, %r10 - adcq %rdx, %r11 + subq %rax, %rcx + sbbq $0x00, %r9 + sbbq $0x00, %r10 + sbbq $0x00, %r11 movq %rcx, 128(%rsp) movq %r9, 136(%rsp) movq %r10, 144(%rsp) movq %r11, 152(%rsp) # Square # A[0] * A[1] - movq (%rsp), %rax - mulq 8(%rsp) + movq 32(%rsp), %rax + mulq 40(%rsp) movq %rax, %r9 movq %rdx, %r10 # A[0] * A[2] - movq (%rsp), %rax - mulq 16(%rsp) + movq 32(%rsp), %rax + mulq 48(%rsp) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[0] * A[3] - movq (%rsp), %rax - mulq 24(%rsp) + movq 32(%rsp), %rax + mulq 56(%rsp) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[1] * A[2] - movq 8(%rsp), %rax - mulq 16(%rsp) + movq 40(%rsp), %rax + mulq 48(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * A[3] - movq 8(%rsp), %rax - mulq 24(%rsp) + movq 40(%rsp), %rax + mulq 56(%rsp) addq %rax, %r12 adcq %rdx, %r13 # A[2] * A[3] - movq 16(%rsp), %rax - mulq 24(%rsp) + movq 48(%rsp), %rax + mulq 56(%rsp) xorq %r14, %r14 addq %rax, %r13 adcq %rdx, %r14 @@ -3341,86 +3128,65 @@ L_curve25519_x64_bits: adcq %r14, %r14 adcq $0x00, %r15 # A[0] * A[0] - movq (%rsp), %rax + movq 32(%rsp), %rax mulq %rax movq %rax, %rcx movq %rdx, %rbp # A[1] * A[1] - movq 8(%rsp), %rax + movq 40(%rsp), %rax mulq %rax addq %rbp, %r9 adcq %rax, %r10 adcq $0x00, %rdx movq %rdx, %rbp # A[2] * A[2] - movq 16(%rsp), %rax + movq 48(%rsp), %rax mulq %rax addq %rbp, %r11 adcq %rax, %r12 adcq $0x00, %rdx movq %rdx, %rbp # A[3] * A[3] - movq 24(%rsp), %rax + movq 56(%rsp), %rax mulq %rax addq %rax, %r14 adcq %rdx, %r15 addq %rbp, %r13 adcq $0x00, %r14 adcq $0x00, %r15 - # Reduce + movq $38, %rax + mulq %r15 + addq %rax, %r11 + adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rdx andq %rbp, %r11 - # Multiply top half by 19 - movq $19, %rax + movq %rdx, %rbp + movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx - movq $19, %rax + movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 + addq %rbp, %rcx + adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rbp, %r11 - addq %rdx, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 # Store - movq %rcx, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 16(%rsp) - movq %r11, 24(%rsp) + movq %rcx, 32(%rsp) + movq %r9, 40(%rsp) + movq %r10, 48(%rsp) + movq %r11, 56(%rsp) # Multiply by 121666 movq $0x1db42, %rax mulq 128(%rsp) @@ -3450,10 +3216,10 @@ L_curve25519_x64_bits: adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 - movq %rcx, 32(%rsp) - movq %r9, 40(%rsp) - movq %r10, 48(%rsp) - movq %r11, 56(%rsp) + movq %rcx, (%rsp) + movq %r9, 8(%rsp) + movq %r10, 16(%rsp) + movq %r11, 24(%rsp) # Square # A[0] * A[1] movq 64(%rsp), %rax @@ -3526,55 +3292,34 @@ L_curve25519_x64_bits: addq 
%rbp, %r13 adcq $0x00, %r14 adcq $0x00, %r15 - # Reduce + movq $38, %rax + mulq %r15 + addq %rax, %r11 + adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rdx andq %rbp, %r11 - # Multiply top half by 19 - movq $19, %rax + movq %rdx, %rbp + movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx - movq $19, %rax + movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 + addq %rbp, %rcx + adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rbp, %r11 - addq %rdx, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 # Store movq %rcx, 64(%rsp) movq %r9, 72(%rsp) @@ -3583,176 +3328,154 @@ L_curve25519_x64_bits: # Add movq 96(%rsp), %rcx movq 104(%rsp), %r9 - addq 32(%rsp), %rcx + addq (%rsp), %rcx movq 112(%rsp), %r10 - adcq 40(%rsp), %r9 - movq 120(%rsp), %rbp - adcq 48(%rsp), %r10 - movq $-19, %rax - adcq 56(%rsp), %rbp + adcq 8(%rsp), %r9 + movq 120(%rsp), %r11 + adcq 16(%rsp), %r10 + adcq 24(%rsp), %r11 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r11, %rax movq $0x7fffffffffffffff, %rdx - movq %rbp, %r11 - sarq $63, %rbp - # Mask the modulus - andq %rbp, %rax - andq %rbp, %rdx + imulq $19, %rax + andq %rdx, %r11 # Sub modulus (if overflow) - subq %rax, %rcx - sbbq %rbp, %r9 - sbbq %rbp, %r10 - sbbq %rdx, %r11 + addq %rax, %rcx + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 movq %rcx, 96(%rsp) movq %r9, 104(%rsp) movq %r10, 112(%rsp) movq %r11, 120(%rsp) # Multiply # A[0] * B[0] - movq (%rsp), %rax + movq 32(%rsp), %rax mulq (%r8) movq %rax, %rcx movq %rdx, %r9 # A[0] * B[1] - movq 8(%rsp), %rax + movq 40(%rsp), %rax mulq (%r8) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] - movq (%rsp), %rax + movq 32(%rsp), %rax mulq 8(%r8) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] - movq 16(%rsp), %rax + movq 48(%rsp), %rax mulq (%r8) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] - movq 8(%rsp), %rax + movq 40(%rsp), %rax mulq 8(%r8) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] - movq (%rsp), %rax + movq 32(%rsp), %rax mulq 16(%r8) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] - movq 24(%rsp), %rax + movq 56(%rsp), %rax mulq (%r8) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] - movq 16(%rsp), %rax + movq 48(%rsp), %rax mulq 8(%r8) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] - movq 8(%rsp), %rax + movq 40(%rsp), %rax mulq 16(%r8) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] - movq (%rsp), %rax + movq 32(%rsp), %rax mulq 24(%r8) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] - movq 24(%rsp), %rax + movq 56(%rsp), %rax mulq 8(%r8) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] - movq 16(%rsp), %rax + movq 48(%rsp), %rax mulq 16(%r8) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * 
B[1] - movq 8(%rsp), %rax + movq 40(%rsp), %rax mulq 24(%r8) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] - movq 24(%rsp), %rax + movq 56(%rsp), %rax mulq 16(%r8) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] - movq 16(%rsp), %rax + movq 48(%rsp), %rax mulq 24(%r8) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] - movq 24(%rsp), %rax + movq 56(%rsp), %rax mulq 24(%r8) addq %rax, %r14 adcq %rdx, %r15 - # Reduce + movq $38, %rax + mulq %r15 + addq %rax, %r11 + adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rdx andq %rbp, %r11 - # Multiply top half by 19 - movq $19, %rax + movq %rdx, %rbp + movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx - movq $19, %rax + movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 + addq %rbp, %rcx + adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rbp, %r11 - addq %rdx, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 # Store movq %rcx, 32(%rsp) movq %r9, 40(%rsp) @@ -3857,65 +3580,42 @@ L_curve25519_x64_bits: mulq 152(%rsp) addq %rax, %r14 adcq %rdx, %r15 - # Reduce + movq $38, %rax + mulq %r15 + addq %rax, %r11 + adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rdx andq %rbp, %r11 - # Multiply top half by 19 - movq $19, %rax + movq %rdx, %rbp + movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx - movq $19, %rax + movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 + addq %rbp, %rcx + adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbp, %r11 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rbp, %r11 - addq %rdx, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 # Store movq %rcx, (%rsp) movq %r9, 8(%rsp) movq %r10, 16(%rsp) movq %r11, 24(%rsp) - decb 168(%rsp) + movq 160(%rsp), %r9 + decq %r9 jge L_curve25519_x64_bits - movq $63, 168(%rsp) - decb 160(%rsp) - jge L_curve25519_x64_words # Invert leaq 32(%rsp), %rdi movq %rsp, %rsi @@ -4153,7 +3853,7 @@ L_curve25519_x64_bits: #else callq _fe_mul_x64 #endif /* __APPLE__ */ - movq 176(%rsp), %rdi + movq 168(%rsp), %rdi # Multiply # A[0] * B[0] movq (%rsp), %rax @@ -4253,77 +3953,66 @@ L_curve25519_x64_bits: mulq 24(%rdi) addq %rax, %r14 adcq %rdx, %r15 - # Reduce + movq $38, %rax + mulq %r15 + addq %rax, %r11 + 
adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 + shldq $0x01, %r11, %rdx + imulq $19, %rdx, %rdx andq %rbp, %r11 - # Multiply top half by 19 - movq $19, %rax + movq %rdx, %rbp + movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx - movq $19, %rax + movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 + addq %rbp, %rcx + adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax + movq $0x7fffffffffffffff, %rbp + movq %r11, %rax + sarq $63, %rax + andq $19, %rax andq %rbp, %r11 addq %rax, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 - # Reduce if top bit set + movq $0x7fffffffffffffff, %rax + movq %rcx, %rdx + addq $19, %rdx + movq %r9, %rdx + adcq $0x00, %rdx + movq %r10, %rdx + adcq $0x00, %rdx movq %r11, %rdx + adcq $0x00, %rdx sarq $63, %rdx andq $19, %rdx - andq %rbp, %r11 + andq %rax, %r11 addq %rdx, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 - movq %rcx, %rax - addq $19, %rax - movq %r9, %rax - adcq $0x00, %rax - movq %r10, %rax - adcq $0x00, %rax - movq %r11, %rax - adcq $0x00, %rax - sarq $63, %rax - andq $19, %rax - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - andq %rbp, %r11 # Store movq %rcx, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) xorq %rax, %rax - addq $0xb8, %rsp + addq $0xb0, %rsp popq %rbp popq %rbx popq %r15 @@ -4334,94 +4023,237 @@ L_curve25519_x64_bits: #ifndef __APPLE__ .size curve25519_x64,.-curve25519_x64 #endif /* __APPLE__ */ +#ifdef HAVE_ED25519 #ifndef __APPLE__ .text -.globl fe_pow22523_x64 -.type fe_pow22523_x64,@function +.globl fe_sq2_x64 +.type fe_sq2_x64,@function .align 16 -fe_pow22523_x64: +fe_sq2_x64: #else .section __TEXT,__text -.globl _fe_pow22523_x64 +.globl _fe_sq2_x64 .p2align 4 -_fe_pow22523_x64: -#endif /* __APPLE__ */ - subq $0x70, %rsp - # pow22523 - movq %rdi, 96(%rsp) - movq %rsi, 104(%rsp) - movq %rsp, %rdi - movq 104(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - movq 104(%rsp), %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - movq %rsp, %rdi - movq %rsp, %rsi - leaq 32(%rsp), %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - movq %rsp, %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - movq %rsp, %rdi - leaq 32(%rsp), %rsi - movq %rsp, %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt -#else - callq _fe_mul_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - movq %rsp, %rsi -#ifndef __APPLE__ - callq fe_sq_x64@plt -#else - callq _fe_sq_x64 -#endif /* __APPLE__ */ - leaq 32(%rsp), %rdi - leaq 32(%rsp), %rsi - movq $4, %rdx -#ifndef __APPLE__ - callq fe_sq_n_x64@plt -#else - callq 
_fe_sq_n_x64 +_fe_sq2_x64: #endif /* __APPLE__ */ - movq %rsp, %rdi - leaq 32(%rsp), %rsi - movq %rsp, %rdx -#ifndef __APPLE__ - callq fe_mul_x64@plt + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + # Square * 2 + # A[0] * A[1] + movq (%rsi), %rax + mulq 8(%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * A[2] + movq (%rsi), %rax + mulq 16(%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[0] * A[3] + movq (%rsi), %rax + mulq 24(%rsi) + xorq %r11, %r11 + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * A[2] + movq 8(%rsi), %rax + mulq 16(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[1] * A[3] + movq 8(%rsi), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + # A[2] * A[3] + movq 16(%rsi), %rax + mulq 24(%rsi) + xorq %r13, %r13 + addq %rax, %r12 + adcq %rdx, %r13 + # Double + xorq %r14, %r14 + addq %r8, %r8 + adcq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq $0x00, %r14 + # A[0] * A[0] + movq (%rsi), %rax + mulq %rax + movq %rax, %rcx + movq %rdx, %r15 + # A[1] * A[1] + movq 8(%rsi), %rax + mulq %rax + addq %r15, %r8 + adcq %rax, %r9 + adcq $0x00, %rdx + movq %rdx, %r15 + # A[2] * A[2] + movq 16(%rsi), %rax + mulq %rax + addq %r15, %r10 + adcq %rax, %r11 + adcq $0x00, %rdx + movq %rdx, %r15 + # A[3] * A[3] + movq 24(%rsi), %rax + mulq %rax + addq %rax, %r13 + adcq %rdx, %r14 + addq %r15, %r12 + adcq $0x00, %r13 + adcq $0x00, %r14 + movq $38, %rax + mulq %r14 + addq %rax, %r10 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r15 + shldq $0x01, %r10, %rdx + imulq $19, %rdx, %rdx + andq %r15, %r10 + movq %rdx, %r15 + movq $38, %rax + mulq %r11 + xorq %r11, %r11 + addq %rax, %rcx + movq $38, %rax + adcq %rdx, %r11 + mulq %r12 + xorq %r12, %r12 + addq %rax, %r8 + movq $38, %rax + adcq %rdx, %r12 + mulq %r13 + xorq %r13, %r13 + addq %rax, %r9 + adcq %rdx, %r13 + addq %r15, %rcx + adcq %r11, %r8 + adcq %r12, %r9 + adcq %r13, %r10 + mov %r10, %rax + shldq $0x01, %r9, %r10 + shldq $0x01, %r8, %r9 + shldq $0x01, %rcx, %r8 + shlq $0x01, %rcx + movq $0x7fffffffffffffff, %r15 + shrq $62, %rax + andq %r15, %r10 + imulq $19, %rax, %rax + addq %rax, %rcx + adcq $0x00, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + # Store + movq %rcx, (%rdi) + movq %r8, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size fe_sq2_x64,.-fe_sq2_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl fe_pow22523_x64 +.type fe_pow22523_x64,@function +.align 16 +fe_pow22523_x64: +#else +.section __TEXT,__text +.globl _fe_pow22523_x64 +.p2align 4 +_fe_pow22523_x64: +#endif /* __APPLE__ */ + subq $0x70, %rsp + # pow22523 + movq %rdi, 96(%rsp) + movq %rsi, 104(%rsp) + movq %rsp, %rdi + movq 104(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + movq 104(%rsp), %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + movq %rsp, %rdi + movq %rsp, %rsi + leaq 32(%rsp), %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + movq %rsp, %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else 
+ callq _fe_sq_x64 +#endif /* __APPLE__ */ + movq %rsp, %rdi + leaq 32(%rsp), %rsi + movq %rsp, %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt +#else + callq _fe_mul_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + movq %rsp, %rsi +#ifndef __APPLE__ + callq fe_sq_x64@plt +#else + callq _fe_sq_x64 +#endif /* __APPLE__ */ + leaq 32(%rsp), %rdi + leaq 32(%rsp), %rsi + movq $4, %rdx +#ifndef __APPLE__ + callq fe_sq_n_x64@plt +#else + callq _fe_sq_n_x64 +#endif /* __APPLE__ */ + movq %rsp, %rdi + leaq 32(%rsp), %rsi + movq %rsp, %rdx +#ifndef __APPLE__ + callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ @@ -4591,5069 +4423,4784 @@ _fe_pow22523_x64: repz retq #ifndef __APPLE__ .text -.globl fe_ge_to_p2_x64 -.type fe_ge_to_p2_x64,@function +.globl ge_p1p1_to_p2_x64 +.type ge_p1p1_to_p2_x64,@function .align 16 -fe_ge_to_p2_x64: +ge_p1p1_to_p2_x64: #else .section __TEXT,__text -.globl _fe_ge_to_p2_x64 +.globl _ge_p1p1_to_p2_x64 .p2align 4 -_fe_ge_to_p2_x64: +_ge_p1p1_to_p2_x64: #endif /* __APPLE__ */ - pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 - subq $40, %rsp - movq %rsi, (%rsp) - movq %rdx, 8(%rsp) - movq %rcx, 16(%rsp) - movq %r8, 24(%rsp) - movq %r9, 32(%rsp) - movq 16(%rsp), %rsi - movq 88(%rsp), %rbx + pushq %rbx + subq $16, %rsp + movq %rdi, (%rsp) + movq %rsi, 8(%rsp) + movq %rsi, %rcx + addq $0x60, %rcx # Multiply # A[0] * B[0] - movq (%rbx), %rax + movq (%rcx), %rax mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 + movq %rax, %r9 + movq %rdx, %r10 # A[0] * B[1] - movq 8(%rbx), %rax + movq 8(%rcx), %rax mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax + # A[1] * B[0] + movq (%rcx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax + # A[0] * B[2] + movq 16(%rcx), %rax mulq (%rsi) - xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax + # A[1] * B[1] + movq 8(%rcx), %rax mulq 8(%rsi) + xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax + # A[2] * B[0] + movq (%rcx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) + # A[0] * B[3] + movq 24(%rcx), %rax + mulq (%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax + # A[1] * B[2] + movq 16(%rcx), %rax + mulq 8(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[1] + movq 8(%rcx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax + # A[3] * B[0] + movq (%rcx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) + # A[1] * B[3] + movq 24(%rcx), %rax + mulq 8(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax + # A[2] * B[2] + movq 16(%rcx), %rax + mulq 16(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[1] + movq 8(%rcx), %rax 
mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax + # A[2] * B[3] + movq 24(%rcx), %rax + mulq 16(%rsi) + xorq %rbx, %rbx + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[3] * B[2] + movq 16(%rcx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 + adcq $0x00, %rbx + # A[3] * B[3] + movq 24(%rcx), %rax + mulq 24(%rsi) + addq %rax, %r15 + adcq %rdx, %rbx + movq $38, %rax + mulq %rbx + addq %rax, %r12 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r8 + shldq $0x01, %r12, %rdx + imulq $19, %rdx, %rdx + andq %r8, %r12 + movq %rdx, %r8 + movq $38, %rax mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining product results in - addq %r12, %r9 + xorq %r15, %r15 + addq %rax, %r11 + adcq %rdx, %r15 + addq %r8, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + adcq %r15, %r12 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq (%rsp), %rdi - movq 24(%rsp), %rsi - movq 32(%rsp), %rbx + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + addq $0x40, %rsi + addq $0x40, %rdi # Multiply # A[0] * B[0] - movq (%rbx), %rax + movq (%rcx), %rax mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 + movq %rax, %r9 + movq %rdx, %r10 # A[0] * B[1] - movq 8(%rbx), %rax + movq 8(%rcx), %rax mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax + # A[1] * B[0] + movq (%rcx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax + # A[0] * B[2] + movq 16(%rcx), %rax mulq (%rsi) - xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax + # A[1] * B[1] + movq 8(%rcx), %rax mulq 8(%rsi) + xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax + # A[2] * B[0] + movq (%rcx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) + # A[0] * B[3] + movq 24(%rcx), %rax + mulq (%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax + # A[1] * B[2] + movq 16(%rcx), %rax + mulq 8(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 
+ adcq $0x00, %r14 + # A[2] * B[1] + movq 8(%rcx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax + # A[3] * B[0] + movq (%rcx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) + # A[1] * B[3] + movq 24(%rcx), %rax + mulq 8(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax + # A[2] * B[2] + movq 16(%rcx), %rax + mulq 16(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[1] + movq 8(%rcx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax + # A[2] * B[3] + movq 24(%rcx), %rax + mulq 16(%rsi) + xorq %rbx, %rbx + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[3] * B[2] + movq 16(%rcx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 + adcq $0x00, %rbx + # A[3] * B[3] + movq 24(%rcx), %rax + mulq 24(%rsi) + addq %rax, %r15 + adcq %rdx, %rbx + movq $38, %rax + mulq %rbx + addq %rax, %r12 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r8 + shldq $0x01, %r12, %rdx + imulq $19, %rdx, %rdx + andq %r8, %r12 + movq %rdx, %r8 + movq $38, %rax mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining product results in - addq %r12, %r9 + xorq %r15, %r15 + addq %rax, %r11 + adcq %rdx, %r15 + addq %r8, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + adcq %r15, %r12 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 32(%rsp), %rsi - movq 88(%rsp), %rbx + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + movq %rsi, %rcx + subq $32, %rcx + subq $32, %rdi # Multiply # A[0] * B[0] - movq (%rbx), %rax + movq (%rcx), %rax mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 + movq %rax, %r9 + movq %rdx, %r10 # A[0] * B[1] - movq 8(%rbx), %rax + movq 8(%rcx), %rax mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax + # A[1] * B[0] + movq (%rcx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax + # A[0] * B[2] + movq 16(%rcx), %rax mulq (%rsi) - xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax + # A[1] * B[1] + movq 
8(%rcx), %rax mulq 8(%rsi) + xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax + # A[2] * B[0] + movq (%rcx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) + # A[0] * B[3] + movq 24(%rcx), %rax + mulq (%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax + # A[1] * B[2] + movq 16(%rcx), %rax + mulq 8(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[1] + movq 8(%rcx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax + # A[3] * B[0] + movq (%rcx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) + # A[1] * B[3] + movq 24(%rcx), %rax + mulq 8(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax + # A[2] * B[2] + movq 16(%rcx), %rax + mulq 16(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[1] + movq 8(%rcx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax + # A[2] * B[3] + movq 24(%rcx), %rax + mulq 16(%rsi) + xorq %rbx, %rbx + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[3] * B[2] + movq 16(%rcx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 + adcq $0x00, %rbx + # A[3] * B[3] + movq 24(%rcx), %rax + mulq 24(%rsi) + addq %rax, %r15 + adcq %rdx, %rbx + movq $38, %rax + mulq %rbx + addq %rax, %r12 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r8 + shldq $0x01, %r12, %rdx + imulq $19, %rdx, %rdx + andq %r8, %r12 + movq %rdx, %r8 + movq $38, %rax mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining product results in - addq %r12, %r9 + xorq %r15, %r15 + addq %rax, %r11 + adcq %rdx, %r15 + addq %r8, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + adcq %r15, %r12 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - addq $40, %rsp + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + addq $16, %rsp + popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 - popq %rbx repz retq #ifndef __APPLE__ -.size fe_ge_to_p2_x64,.-fe_ge_to_p2_x64 +.size ge_p1p1_to_p2_x64,.-ge_p1p1_to_p2_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text -.globl fe_ge_to_p3_x64 -.type fe_ge_to_p3_x64,@function +.globl ge_p1p1_to_p3_x64 +.type ge_p1p1_to_p3_x64,@function .align 16 -fe_ge_to_p3_x64: +ge_p1p1_to_p3_x64: #else 
.section __TEXT,__text -.globl _fe_ge_to_p3_x64 +.globl _ge_p1p1_to_p3_x64 .p2align 4 -_fe_ge_to_p3_x64: +_ge_p1p1_to_p3_x64: #endif /* __APPLE__ */ - pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 - subq $40, %rsp - movq %rsi, (%rsp) - movq %rdx, 8(%rsp) - movq %rcx, 16(%rsp) - movq %r8, 24(%rsp) - movq %r9, 32(%rsp) - movq 24(%rsp), %rsi - movq 96(%rsp), %rbx + pushq %rbx + subq $16, %rsp + movq %rdi, (%rsp) + movq %rsi, 8(%rsp) + movq %rsi, %rcx + addq $0x60, %rcx # Multiply # A[0] * B[0] - movq (%rbx), %rax + movq (%rcx), %rax mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 + movq %rax, %r9 + movq %rdx, %r10 # A[0] * B[1] - movq 8(%rbx), %rax + movq 8(%rcx), %rax mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax + # A[1] * B[0] + movq (%rcx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax + # A[0] * B[2] + movq 16(%rcx), %rax mulq (%rsi) - xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax + # A[1] * B[1] + movq 8(%rcx), %rax mulq 8(%rsi) + xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax + # A[2] * B[0] + movq (%rcx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) + # A[0] * B[3] + movq 24(%rcx), %rax + mulq (%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax + # A[1] * B[2] + movq 16(%rcx), %rax + mulq 8(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[1] + movq 8(%rcx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax + # A[3] * B[0] + movq (%rcx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) + # A[1] * B[3] + movq 24(%rcx), %rax + mulq 8(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax + # A[2] * B[2] + movq 16(%rcx), %rax + mulq 16(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[1] + movq 8(%rcx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax + # A[2] * B[3] + movq 24(%rcx), %rax + mulq 16(%rsi) + xorq %rbx, %rbx + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[3] * B[2] + movq 16(%rcx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 + adcq $0x00, %rbx + # A[3] * B[3] + movq 24(%rcx), %rax + mulq 24(%rsi) + addq %rax, %r15 + adcq %rdx, %rbx + movq $38, %rax + mulq %rbx + addq %rax, %r12 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r8 + shldq 
$0x01, %r12, %rdx + imulq $19, %rdx, %rdx + andq %r8, %r12 + movq %rdx, %r8 + movq $38, %rax mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining product results in - addq %r12, %r9 + xorq %r15, %r15 + addq %rax, %r11 + adcq %rdx, %r15 + addq %r8, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + adcq %r15, %r12 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq (%rsp), %rdi - movq 32(%rsp), %rsi - movq 88(%rsp), %rbx + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + movq %rsi, %rcx + addq $32, %rcx + addq $0x60, %rdi # Multiply # A[0] * B[0] - movq (%rbx), %rax + movq (%rcx), %rax mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 + movq %rax, %r9 + movq %rdx, %r10 # A[0] * B[1] - movq 8(%rbx), %rax + movq 8(%rcx), %rax mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax + # A[1] * B[0] + movq (%rcx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax + # A[0] * B[2] + movq 16(%rcx), %rax mulq (%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + # A[1] * B[1] + movq 8(%rcx), %rax + mulq 8(%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax + # A[2] * B[0] + movq (%rcx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) + # A[0] * B[3] + movq 24(%rcx), %rax + mulq (%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax + # A[1] * B[2] + movq 16(%rcx), %rax + mulq 8(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[1] + movq 8(%rcx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax + # A[3] * B[0] + movq (%rcx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) + # A[1] * B[3] + movq 24(%rcx), %rax + mulq 8(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax + # A[2] * B[2] + movq 16(%rcx), %rax + mulq 16(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[1] + movq 8(%rcx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax + # A[2] * B[3] + movq 24(%rcx), %rax + mulq 16(%rsi) + xorq %rbx, %rbx + addq %rax, %r14 + adcq %rdx, %r15 + 
adcq $0x00, %rbx + # A[3] * B[2] + movq 16(%rcx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 + adcq $0x00, %rbx + # A[3] * B[3] + movq 24(%rcx), %rax + mulq 24(%rsi) + addq %rax, %r15 + adcq %rdx, %rbx + movq $38, %rax + mulq %rbx + addq %rax, %r12 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r8 + shldq $0x01, %r12, %rdx + imulq $19, %rdx, %rdx + andq %r8, %r12 + movq %rdx, %r8 + movq $38, %rax mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining product results in - addq %r12, %r9 + xorq %r15, %r15 + addq %rax, %r11 + adcq %rdx, %r15 + addq %r8, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + adcq %r15, %r12 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 88(%rsp), %rsi - movq 96(%rsp), %rbx + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + addq $0x40, %rsi + subq $0x40, %rdi # Multiply # A[0] * B[0] - movq (%rbx), %rax + movq (%rcx), %rax mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 + movq %rax, %r9 + movq %rdx, %r10 # A[0] * B[1] - movq 8(%rbx), %rax + movq 8(%rcx), %rax mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax + # A[1] * B[0] + movq (%rcx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax + # A[0] * B[2] + movq 16(%rcx), %rax mulq (%rsi) - xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax + # A[1] * B[1] + movq 8(%rcx), %rax mulq 8(%rsi) + xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax + # A[2] * B[0] + movq (%rcx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) + # A[0] * B[3] + movq 24(%rcx), %rax + mulq (%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax + # A[1] * B[2] + movq 16(%rcx), %rax + mulq 8(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[1] + movq 8(%rcx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax + # A[3] * B[0] + movq (%rcx), %rax mulq 24(%rsi) 
addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) + # A[1] * B[3] + movq 24(%rcx), %rax + mulq 8(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax + # A[2] * B[2] + movq 16(%rcx), %rax + mulq 16(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[1] + movq 8(%rcx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax + # A[2] * B[3] + movq 24(%rcx), %rax + mulq 16(%rsi) + xorq %rbx, %rbx + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[3] * B[2] + movq 16(%rcx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 + adcq $0x00, %rbx + # A[3] * B[3] + movq 24(%rcx), %rax + mulq 24(%rsi) + addq %rax, %r15 + adcq %rdx, %rbx + movq $38, %rax + mulq %rbx + addq %rax, %r12 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r8 + shldq $0x01, %r12, %rdx + imulq $19, %rdx, %rdx + andq %r8, %r12 + movq %rdx, %r8 + movq $38, %rax mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining product results in - addq %r12, %r9 + xorq %r15, %r15 + addq %rax, %r11 + adcq %rdx, %r15 + addq %r8, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + adcq %r15, %r12 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - movq 24(%rsp), %rsi - movq 32(%rsp), %rbx + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + movq %rsi, %rcx + addq $32, %rcx + addq $32, %rdi # Multiply # A[0] * B[0] - movq (%rbx), %rax + movq (%rcx), %rax mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 + movq %rax, %r9 + movq %rdx, %r10 # A[0] * B[1] - movq 8(%rbx), %rax + movq 8(%rcx), %rax mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax + # A[1] * B[0] + movq (%rcx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax + # A[0] * B[2] + movq 16(%rcx), %rax mulq (%rsi) - xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax + # A[1] * B[1] + movq 8(%rcx), %rax mulq 8(%rsi) + xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax + # A[2] * B[0] + movq (%rcx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 
adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) + # A[0] * B[3] + movq 24(%rcx), %rax + mulq (%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax + # A[1] * B[2] + movq 16(%rcx), %rax + mulq 8(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[1] + movq 8(%rcx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax + # A[3] * B[0] + movq (%rcx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) + # A[1] * B[3] + movq 24(%rcx), %rax + mulq 8(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax + # A[2] * B[2] + movq 16(%rcx), %rax + mulq 16(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[1] + movq 8(%rcx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax + # A[2] * B[3] + movq 24(%rcx), %rax + mulq 16(%rsi) + xorq %rbx, %rbx + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[3] * B[2] + movq 16(%rcx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 + adcq $0x00, %rbx + # A[3] * B[3] + movq 24(%rcx), %rax + mulq 24(%rsi) + addq %rax, %r15 + adcq %rdx, %rbx + movq $38, %rax + mulq %rbx + addq %rax, %r12 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r8 + shldq $0x01, %r12, %rdx + imulq $19, %rdx, %rdx + andq %r8, %r12 + movq %rdx, %r8 + movq $38, %rax mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining product results in - addq %r12, %r9 + xorq %r15, %r15 + addq %rax, %r11 + adcq %rdx, %r15 + addq %r8, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + adcq %r15, %r12 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - addq $40, %rsp + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + addq $16, %rsp + popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 - popq %rbx repz retq #ifndef __APPLE__ -.size fe_ge_to_p3_x64,.-fe_ge_to_p3_x64 +.size ge_p1p1_to_p3_x64,.-ge_p1p1_to_p3_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text -.globl fe_ge_dbl_x64 -.type fe_ge_dbl_x64,@function +.globl ge_p2_dbl_x64 +.type ge_p2_dbl_x64,@function .align 16 -fe_ge_dbl_x64: +ge_p2_dbl_x64: #else .section __TEXT,__text -.globl _fe_ge_dbl_x64 +.globl _ge_p2_dbl_x64 .p2align 4 -_fe_ge_dbl_x64: +_ge_p2_dbl_x64: #endif /* __APPLE__ */ - pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 - subq $0x50, %rsp + pushq %rbx + subq $16, 
%rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rcx, 24(%rsp) - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq (%rsp), %rdi - movq 32(%rsp), %rsi + addq $0x40, %rdi # Square # A[0] * A[1] movq (%rsi), %rax mulq 8(%rsi) - movq %rax, %r9 - movq %rdx, %r10 + movq %rax, %r10 + movq %rdx, %r11 # A[0] * A[2] movq (%rsi), %rax mulq 16(%rsi) - xorq %r11, %r11 - addq %rax, %r10 - adcq %rdx, %r11 - # A[0] * A[3] - movq (%rsi), %rax - mulq 24(%rsi) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 + # A[0] * A[3] + movq (%rsi), %rax + mulq 24(%rsi) + xorq %r13, %r13 + addq %rax, %r12 + adcq %rdx, %r13 # A[1] * A[2] movq 8(%rsi), %rax mulq 16(%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 # A[1] * A[3] movq 8(%rsi), %rax mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 + addq %rax, %r13 + adcq %rdx, %r14 # A[2] * A[3] movq 16(%rsi), %rax mulq 24(%rsi) - xorq %r14, %r14 - addq %rax, %r13 - adcq %rdx, %r14 - # Double xorq %r15, %r15 - addq %r9, %r9 - adcq %r10, %r10 + addq %rax, %r14 + adcq %rdx, %r15 + # Double + xorq %rbx, %rbx + addq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 - adcq $0x00, %r15 + adcq %r15, %r15 + adcq $0x00, %rbx # A[0] * A[0] movq (%rsi), %rax mulq %rax - movq %rax, %r8 - movq %rdx, %rcx + movq %rax, %r9 + movq %rdx, %r8 # A[1] * A[1] movq 8(%rsi), %rax mulq %rax - addq %rcx, %r9 - adcq %rax, %r10 + addq %r8, %r10 + adcq %rax, %r11 adcq $0x00, %rdx - movq %rdx, %rcx + movq %rdx, %r8 # A[2] * A[2] movq 16(%rsi), %rax mulq %rax - addq %rcx, %r11 - adcq %rax, %r12 + addq %r8, %r12 + adcq %rax, %r13 adcq $0x00, %rdx - movq %rdx, %rcx + movq %rdx, %r8 # A[3] * A[3] movq 24(%rsi), %rax mulq %rax - addq %rax, %r14 - adcq %rdx, %r15 - addq %rcx, %r13 - adcq $0x00, %r14 + addq %rax, %r15 + adcq %rdx, %rbx + addq %r8, %r14 adcq $0x00, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 + adcq $0x00, %rbx + movq $38, %rax + mulq %rbx + addq %rax, %r12 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r8 + shldq $0x01, %r12, %rdx + imulq $19, %rdx, %rdx + andq %r8, %r12 + movq %rdx, %r8 + movq $38, %rax mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining product results in - addq %r12, %r9 + xorq %r15, %r15 + addq %rax, %r11 + adcq %rdx, %r15 + addq %r8, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + adcq %r15, %r12 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - movq 40(%rsp), %rsi + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + addq $32, %rsi # Square # A[0] * A[1] movq (%rsi), %rax mulq 8(%rsi) - movq %rax, %r9 - movq %rdx, %r10 + movq 
%rax, %r10 + movq %rdx, %r11 # A[0] * A[2] movq (%rsi), %rax mulq 16(%rsi) - xorq %r11, %r11 - addq %rax, %r10 - adcq %rdx, %r11 - # A[0] * A[3] - movq (%rsi), %rax - mulq 24(%rsi) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 + # A[0] * A[3] + movq (%rsi), %rax + mulq 24(%rsi) + xorq %r13, %r13 + addq %rax, %r12 + adcq %rdx, %r13 # A[1] * A[2] movq 8(%rsi), %rax mulq 16(%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 # A[1] * A[3] movq 8(%rsi), %rax mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 + addq %rax, %r13 + adcq %rdx, %r14 # A[2] * A[3] movq 16(%rsi), %rax mulq 24(%rsi) - xorq %r14, %r14 - addq %rax, %r13 - adcq %rdx, %r14 - # Double xorq %r15, %r15 - addq %r9, %r9 - adcq %r10, %r10 + addq %rax, %r14 + adcq %rdx, %r15 + # Double + xorq %rbx, %rbx + addq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 - adcq $0x00, %r15 + adcq %r15, %r15 + adcq $0x00, %rbx # A[0] * A[0] movq (%rsi), %rax mulq %rax - movq %rax, %r8 - movq %rdx, %rcx + movq %rax, %r9 + movq %rdx, %r8 # A[1] * A[1] movq 8(%rsi), %rax mulq %rax - addq %rcx, %r9 - adcq %rax, %r10 + addq %r8, %r10 + adcq %rax, %r11 adcq $0x00, %rdx - movq %rdx, %rcx + movq %rdx, %r8 # A[2] * A[2] movq 16(%rsi), %rax mulq %rax - addq %rcx, %r11 - adcq %rax, %r12 + addq %r8, %r12 + adcq %rax, %r13 adcq $0x00, %rdx - movq %rdx, %rcx + movq %rdx, %r8 # A[3] * A[3] movq 24(%rsi), %rax mulq %rax - addq %rax, %r14 - adcq %rdx, %r15 - addq %rcx, %r13 - adcq $0x00, %r14 + addq %rax, %r15 + adcq %rdx, %rbx + addq %r8, %r14 adcq $0x00, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 + adcq $0x00, %rbx + movq $38, %rax + mulq %rbx + addq %rax, %r12 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r8 + shldq $0x01, %r12, %rdx + imulq $19, %rdx, %rdx + andq %r8, %r12 + movq %rdx, %r8 + movq $38, %rax mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining product results in - addq %r12, %r9 + xorq %r15, %r15 + addq %rax, %r11 + adcq %rdx, %r15 + addq %r8, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 + adcq %r15, %r12 + # Store + movq %rdi, %rsi + subq $32, %rdi + # Add-Sub + # Add + movq %r9, %r13 + addq (%rsi), %r9 + movq %r10, %r14 + adcq 8(%rsi), %r10 + movq %r11, %r15 + adcq 16(%rsi), %r11 + movq %r12, %rbx + adcq 24(%rsi), %r12 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r12, %rax + movq $0x7fffffffffffffff, %rdx + imulq $19, %rax + andq %rdx, %r12 + # Sub modulus (if overflow) + addq %rax, %r9 adcq $0x00, %r10 adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 + adcq $0x00, %r12 + # Sub + subq (%rsi), %r13 + sbbq 8(%rsi), %r14 + sbbq 16(%rsi), %r15 + sbbq 24(%rsi), %rbx + sbbq %rax, %rax + shldq $0x01, %rbx, %rax + imulq $-19, %rax + andq %rdx, %rbx + # Add modulus (if underflow) + subq %rax, %r13 + sbbq $0x00, 
%r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + movq %r13, (%rsi) + movq %r14, 8(%rsi) + movq %r15, 16(%rsi) + movq %rbx, 24(%rsi) + movq 8(%rsp), %rcx + movq %rcx, %rsi + addq $32, %rsi + subq $32, %rdi + # Add + movq (%rsi), %r9 + movq 8(%rsi), %r10 + addq (%rcx), %r9 + movq 16(%rsi), %r11 + adcq 8(%rcx), %r10 + movq 24(%rsi), %r12 + adcq 16(%rcx), %r11 + adcq 24(%rcx), %r12 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r12, %rax + movq $0x7fffffffffffffff, %rdx + imulq $19, %rax + andq %rdx, %r12 + # Sub modulus (if overflow) + addq %rax, %r9 adcq $0x00, %r10 adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 24(%rsp), %rdi - movq 128(%rsp), %rsi - # Square * 2 + adcq $0x00, %r12 + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + # Square # A[0] * A[1] - movq (%rsi), %rax - mulq 8(%rsi) - movq %rax, %r9 - movq %rdx, %r10 + movq (%rdi), %rax + mulq 8(%rdi) + movq %rax, %r10 + movq %rdx, %r11 # A[0] * A[2] - movq (%rsi), %rax - mulq 16(%rsi) - xorq %r11, %r11 - addq %rax, %r10 - adcq %rdx, %r11 - # A[0] * A[3] - movq (%rsi), %rax - mulq 24(%rsi) + movq (%rdi), %rax + mulq 16(%rdi) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 - # A[1] * A[2] - movq 8(%rsi), %rax - mulq 16(%rsi) + # A[0] * A[3] + movq (%rdi), %rax + mulq 24(%rdi) xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * A[3] - movq 8(%rsi), %rax - mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 - # A[2] * A[3] - movq 16(%rsi), %rax - mulq 24(%rsi) + # A[1] * A[2] + movq 8(%rdi), %rax + mulq 16(%rdi) xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[1] * A[3] + movq 8(%rdi), %rax + mulq 24(%rdi) addq %rax, %r13 adcq %rdx, %r14 - # Double + # A[2] * A[3] + movq 16(%rdi), %rax + mulq 24(%rdi) xorq %r15, %r15 - addq %r9, %r9 - adcq %r10, %r10 + addq %rax, %r14 + adcq %rdx, %r15 + # Double + xorq %rbx, %rbx + addq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 - adcq $0x00, %r15 + adcq %r15, %r15 + adcq $0x00, %rbx # A[0] * A[0] - movq (%rsi), %rax + movq (%rdi), %rax mulq %rax - movq %rax, %r8 - movq %rdx, %rcx + movq %rax, %r9 + movq %rdx, %r8 # A[1] * A[1] - movq 8(%rsi), %rax + movq 8(%rdi), %rax mulq %rax - addq %rcx, %r9 - adcq %rax, %r10 + addq %r8, %r10 + adcq %rax, %r11 adcq $0x00, %rdx - movq %rdx, %rcx + movq %rdx, %r8 # A[2] * A[2] - movq 16(%rsi), %rax + movq 16(%rdi), %rax mulq %rax - addq %rcx, %r11 - adcq %rax, %r12 + addq %r8, %r12 + adcq %rax, %r13 adcq $0x00, %rdx - movq %rdx, %rcx + movq %rdx, %r8 # A[3] * A[3] - movq 24(%rsi), %rax + movq 24(%rdi), %rax mulq %rax - addq %rax, %r14 - adcq %rdx, %r15 - addq %rcx, %r13 - adcq $0x00, %r14 + addq %rax, %r15 + adcq %rdx, %rbx + addq %r8, %r14 adcq $0x00, %r15 - # Reduce - movq $0x7fffffffffffffff, %rbx - xorq %rax, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $3, %r15, %rax - shldq $2, %r14, %r15 - shldq $2, %r13, %r14 - shldq $2, %r12, %r13 - shldq $2, %r11, %r12 - shldq $0x01, %r10, %r11 - shldq $0x01, %r9, %r10 - shldq $0x01, %r8, %r9 - shlq $0x01, %r8 - andq %rbx, %r11 - # Two out left, one in right - andq %rbx, %r15 - # Multiply top bits by 19*19 - imulq $0x169, %rax, %rcx - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 + adcq $0x00, %rbx + movq $38, %rax + mulq %rbx + addq %rax, %r12 + adcq 
$0x00, %rdx + movq $0x7fffffffffffffff, %r8 + shldq $0x01, %r12, %rdx + imulq $19, %rdx, %rdx + andq %r8, %r12 + movq %rdx, %r8 + movq $38, %rax mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in - addq %rcx, %r8 - adcq %r12, %r9 + xorq %r15, %r15 + addq %rax, %r11 + adcq %rdx, %r15 + addq %r8, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rbx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rbx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + adcq %r15, %r12 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 32(%rsp), %rsi - movq 40(%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx + movq %rdi, %rsi + addq $32, %rsi + # Sub + subq (%rsi), %r9 + sbbq 8(%rsi), %r10 + sbbq 16(%rsi), %r11 + sbbq 24(%rsi), %r12 + sbbq %rax, %rax + shldq $0x01, %r12, %rax movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - leaq 48(%rsp), %rdi - movq 8(%rsp), %rsi - # Square + imulq $-19, %rax + andq %rdx, %r12 + # Add modulus (if underflow) + subq %rax, %r9 + sbbq $0x00, %r10 + sbbq $0x00, %r11 + sbbq $0x00, %r12 + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + addq $0x40, %rcx + # Square * 2 # A[0] * A[1] - movq (%rsi), %rax - mulq 8(%rsi) - movq %rax, %r9 - movq %rdx, %r10 + movq (%rcx), %rax + mulq 8(%rcx) + movq %rax, %r10 + movq %rdx, %r11 # A[0] * A[2] - movq (%rsi), %rax - mulq 16(%rsi) - xorq %r11, %r11 - addq %rax, %r10 - adcq %rdx, %r11 - # A[0] * A[3] - movq (%rsi), %rax - mulq 24(%rsi) + movq (%rcx), %rax + mulq 16(%rcx) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 - # A[1] * A[2] - movq 8(%rsi), %rax - mulq 16(%rsi) + # A[0] * A[3] + movq (%rcx), %rax + mulq 24(%rcx) xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * A[3] - movq 8(%rsi), %rax - mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 - # A[2] * A[3] - movq 16(%rsi), %rax - mulq 24(%rsi) + # A[1] * A[2] + movq 8(%rcx), %rax + mulq 16(%rcx) xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[1] * A[3] + movq 8(%rcx), %rax + mulq 24(%rcx) addq %rax, %r13 adcq %rdx, %r14 - # Double + # A[2] * A[3] + movq 16(%rcx), %rax + mulq 24(%rcx) xorq %r15, %r15 - addq %r9, %r9 - adcq %r10, %r10 + addq %rax, %r14 + adcq %rdx, %r15 + # Double + xorq %rbx, %rbx + addq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 - adcq $0x00, %r15 + adcq %r15, %r15 + adcq $0x00, %rbx # A[0] * A[0] - movq (%rsi), %rax + movq (%rcx), %rax mulq %rax - movq %rax, %r8 - movq %rdx, %rcx + movq %rax, %r9 + movq %rdx, %r8 # A[1] * A[1] - movq 8(%rsi), %rax + movq 8(%rcx), %rax mulq %rax - addq %rcx, %r9 - adcq %rax, %r10 + addq %r8, %r10 + adcq %rax, %r11 adcq $0x00, %rdx - 
movq %rdx, %rcx + movq %rdx, %r8 # A[2] * A[2] - movq 16(%rsi), %rax + movq 16(%rcx), %rax mulq %rax - addq %rcx, %r11 - adcq %rax, %r12 + addq %r8, %r12 + adcq %rax, %r13 adcq $0x00, %rdx - movq %rdx, %rcx + movq %rdx, %r8 # A[3] * A[3] - movq 24(%rsi), %rax + movq 24(%rcx), %rax mulq %rax - addq %rax, %r14 - adcq %rdx, %r15 - addq %rcx, %r13 - adcq $0x00, %r14 + addq %rax, %r15 + adcq %rdx, %rbx + addq %r8, %r14 adcq $0x00, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 + adcq $0x00, %rbx + movq $38, %rax + mulq %rbx + addq %rax, %r12 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r8 + shldq $0x01, %r12, %rdx + imulq $19, %rdx, %rdx + andq %r8, %r12 + movq %rdx, %r8 + movq $38, %rax mulq %r13 xorq %r13, %r13 addq %rax, %r9 - movq $19, %rax + movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining product results in - addq %r12, %r9 + xorq %r15, %r15 + addq %rax, %r11 + adcq %rdx, %r15 + addq %r8, %r9 adcq %r13, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 + adcq %r15, %r12 + mov %r12, %rax + shldq $0x01, %r11, %r12 + shldq $0x01, %r10, %r11 + shldq $0x01, %r9, %r10 + shlq $0x01, %r9 + movq $0x7fffffffffffffff, %r8 + shrq $62, %rax + andq %r8, %r12 + imulq $19, %rax, %rax + addq %rax, %r9 adcq $0x00, %r10 adcq $0x00, %r11 + adcq $0x00, %r12 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 16(%rsp), %rsi - movq (%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx - movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - movq 16(%rsp), %rsi - movq (%rsp), %rbx - # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq (%rsp), %rdi - leaq 48(%rsp), %rsi - movq 8(%rsp), %rbx - # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - 
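The "Square * 2" tail above doubles the partially reduced square with a one-bit shift and folds the two bits that spill past bit 255 back in as a multiple of 19, since 2^255 is congruent to 19 modulo p = 2^255 - 19. A rough C model of just that doubling step, with names of my own choosing rather than anything from the patch, may help when following the register shuffle:

#include <stdint.h>

/* Sketch only: double a 4x64-bit field element modulo p = 2^255 - 19.
 * The bits shifted out above bit 255 are folded back in as *19. */
static void fe_double_sketch(uint64_t r[4], const uint64_t a[4])
{
    uint64_t top = a[3] >> 62;      /* the two bits that will pass bit 255 */
    unsigned __int128 t;

    /* multiply by 2, keeping only bits 0..254 */
    uint64_t r0 = a[0] << 1;
    uint64_t r1 = (a[1] << 1) | (a[0] >> 63);
    uint64_t r2 = (a[2] << 1) | (a[1] >> 63);
    uint64_t r3 = ((a[3] << 1) | (a[2] >> 63)) & 0x7fffffffffffffffULL;

    /* fold the shifted-out bits back in: + top*19 */
    t = (unsigned __int128)r0 + top * 19;             r0 = (uint64_t)t;
    t = (unsigned __int128)r1 + (uint64_t)(t >> 64);  r1 = (uint64_t)t;
    t = (unsigned __int128)r2 + (uint64_t)(t >> 64);  r2 = (uint64_t)t;
    r3 += (uint64_t)(t >> 64);

    r[0] = r0; r[1] = r1; r[2] = r2; r[3] = r3;
}

Every routine in this file keeps values in this partially reduced form (below 2^256, with only the excess above bit 255 folded), which is presumably why the stand-alone "Reduce if top bit set" passes from the old code can be dropped.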
andq %rcx, %rdx - # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 24(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rbx + movq %rdi, %rsi + addq $0x40, %rsi + addq $0x60, %rdi # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 + subq (%rsi), %r9 + sbbq 8(%rsi), %r10 + sbbq 16(%rsi), %r11 + sbbq 24(%rsi), %r12 + sbbq %rax, %rax + shldq $0x01, %r12, %rax movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx + imulq $-19, %rax + andq %rdx, %r12 # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - addq $0x50, %rsp + subq %rax, %r9 + sbbq $0x00, %r10 + sbbq $0x00, %r11 + sbbq $0x00, %r12 + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + addq $16, %rsp + popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 - popq %rbx repz retq #ifndef __APPLE__ -.size fe_ge_dbl_x64,.-fe_ge_dbl_x64 +.size ge_p2_dbl_x64,.-ge_p2_dbl_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text -.globl fe_ge_madd_x64 -.type fe_ge_madd_x64,@function +.globl ge_madd_x64 +.type ge_madd_x64,@function .align 16 -fe_ge_madd_x64: +ge_madd_x64: #else .section __TEXT,__text -.globl _fe_ge_madd_x64 +.globl _ge_madd_x64 .p2align 4 -_fe_ge_madd_x64: +_ge_madd_x64: #endif /* __APPLE__ */ - pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 - subq $0x50, %rsp + pushq %rbx + pushq %rbp + movq %rdx, %rcx + subq $24, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rcx, 24(%rsp) - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq (%rsp), %rdi - movq 40(%rsp), %rsi - movq 32(%rsp), %rbx + movq %rcx, 16(%rsp) + movq %rsi, %r8 + movq %rsi, %rcx + addq $32, %rcx + movq %rdi, %rsi + addq $32, %rsi + # Add-Sub # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx + movq (%rcx), %r10 + movq 8(%rcx), %r11 + movq 16(%rcx), %r12 + movq 24(%rcx), %r13 + movq %r10, %r14 + addq (%r8), %r10 + movq %r11, %r15 + adcq 8(%r8), %r11 + movq %r12, %rbx + adcq 16(%r8), %r12 + movq %r13, %rbp + adcq 24(%r8), %r13 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r13, %rax movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx + imulq $19, %rax + andq %rdx, %r13 # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 40(%rsp), %rsi - movq 32(%rsp), %rbx + addq %rax, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx + subq (%r8), %r14 + sbbq 8(%r8), %r15 + sbbq 16(%r8), %rbx + sbbq 24(%r8), %rbp + sbbq %rax, %rax + shldq $0x01, %rbp, %rax + imulq $-19, 
%rax + andq %rdx, %rbp # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - movq (%rsp), %rsi - movq 152(%rsp), %rbx + subq %rax, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + movq %r14, (%rsi) + movq %r15, 8(%rsi) + movq %rbx, 16(%rsi) + movq %rbp, 24(%rsi) + movq 16(%rsp), %rcx + addq $32, %rcx # Multiply # A[0] * B[0] - movq (%rbx), %rax + movq (%rcx), %rax mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 + movq %rax, %r10 + movq %rdx, %r11 # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax + movq 8(%rcx), %rax mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax + # A[1] * B[0] + movq (%rcx), %rax mulq 8(%rsi) + xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax + # A[0] * B[2] + movq 16(%rcx), %rax + mulq (%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + # A[1] * B[1] + movq 8(%rcx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax + # A[2] * B[0] + movq (%rcx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r12 - adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) + # A[0] * B[3] + movq 24(%rcx), %rax + mulq (%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) + # A[1] * B[2] + movq 16(%rcx), %rax + mulq 8(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax + # A[2] * B[1] + movq 8(%rcx), %rax + mulq 16(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[0] + movq (%rcx), %rax mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[1] * B[3] + movq 24(%rcx), %rax + mulq 8(%rsi) + xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 + adcq $0x00, %rbx + # A[2] * B[2] + movq 16(%rcx), %rax + mulq 16(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[3] * B[1] + movq 8(%rcx), %rax + mulq 24(%rsi) + addq %rax, %r14 + 
adcq %rdx, %r15 + adcq $0x00, %rbx + # A[2] * B[3] + movq 24(%rcx), %rax + mulq 16(%rsi) + xorq %rbp, %rbp + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[2] + movq 16(%rcx), %rax + mulq 24(%rsi) + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[3] + movq 24(%rcx), %rax + mulq 24(%rsi) + addq %rax, %rbx + adcq %rdx, %rbp + movq $38, %rax + mulq %rbp + addq %rax, %r13 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %rdx + imulq $19, %rdx, %rdx + andq %r9, %r13 + movq %rdx, %r9 + movq $38, %rax mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 + xorq %r15, %r15 + addq %rax, %r11 + movq $38, %rax + adcq %rdx, %r15 + mulq %rbx + xorq %rbx, %rbx + addq %rax, %r12 + adcq %rdx, %rbx + addq %r9, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + adcq %r15, %r12 + adcq %rbx, %r13 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 8(%rsp), %rsi - movq 160(%rsp), %rbx + movq %r10, (%rsi) + movq %r11, 8(%rsi) + movq %r12, 16(%rsi) + movq %r13, 24(%rsi) + addq $0x60, %r8 + addq $32, %rcx + addq $0x60, %rdi # Multiply # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 + movq (%rcx), %rax + mulq (%r8) + movq %rax, %r10 + movq %rdx, %r11 # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) + movq 8(%rcx), %rax + mulq (%r8) xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) + # A[1] * B[0] + movq (%rcx), %rax + mulq 8(%r8) + xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 + # A[0] * B[2] + movq 16(%rcx), %rax + mulq (%r8) addq %rax, %r12 adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) + # A[1] * B[1] + movq 8(%rcx), %rax + mulq 8(%r8) + xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[0] + movq (%rcx), %rax + mulq 16(%r8) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) + # A[0] * B[3] + movq 24(%rcx), %rax + mulq (%r8) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 
24(%rsi) + # A[1] * B[2] + movq 16(%rcx), %rax + mulq 8(%r8) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[1] + movq 8(%rcx), %rax + mulq 16(%r8) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[0] + movq (%rcx), %rax + mulq 24(%r8) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[1] * B[3] + movq 24(%rcx), %rax + mulq 8(%r8) + xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 + adcq $0x00, %rbx + # A[2] * B[2] + movq 16(%rcx), %rax + mulq 16(%r8) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[3] * B[1] + movq 8(%rcx), %rax + mulq 24(%r8) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[2] * B[3] + movq 24(%rcx), %rax + mulq 16(%r8) + xorq %rbp, %rbp + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[2] + movq 16(%rcx), %rax + mulq 24(%r8) + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[3] + movq 24(%rcx), %rax + mulq 24(%r8) + addq %rax, %rbx + adcq %rdx, %rbp + movq $38, %rax + mulq %rbp + addq %rax, %r13 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %rdx + imulq $19, %rdx, %rdx + andq %r9, %r13 + movq %rdx, %r9 + movq $38, %rax mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 + xorq %r15, %r15 + addq %rax, %r11 + movq $38, %rax + adcq %rdx, %r15 + mulq %rbx + xorq %rbx, %rbx + addq %rax, %r12 + adcq %rdx, %rbx + addq %r9, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + adcq %r15, %r12 + adcq %rbx, %r13 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 24(%rsp), %rdi - movq 144(%rsp), %rsi - movq 136(%rsp), %rbx + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + subq $0x40, %rcx + subq $0x60, %rdi # Multiply # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 + movq (%rcx), %rax + mulq (%rdi) + movq %rax, %r10 + movq %rdx, %r11 # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 + movq 8(%rcx), %rax + mulq (%rdi) + xorq %r12, %r12 + addq %rax, %r11 + adcq %rdx, %r12 # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 + movq (%rcx), %rax + mulq 8(%rdi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) - xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - 
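Throughout these multiplies the old reduction (shift the top half left by one, multiply by 19) is replaced by folding the upper four product limbs straight back in with a multiply by 38, because 2^256 is congruent to 2*19 = 38 modulo p = 2^255 - 19; whatever is left above bit 255 then goes around once more as 19. A sketch of that fold in C, over a 512-bit product split into low limbs l[] and high limbs h[] (the helper name and types are mine, not part of the patch):

#include <stdint.h>

/* Sketch only: fold p512 = h*2^256 + l back into 256 bits mod 2^255 - 19,
 * leaving the result partially reduced like the generated assembly does. */
static void fe_reduce_sketch(uint64_t r[4], const uint64_t l[4],
                             const uint64_t h[4])
{
    unsigned __int128 t;
    uint64_t c, r0, r1, r2, r3;

    /* l + 38*h, limb by limb with carries */
    t = (unsigned __int128)l[0] + (unsigned __int128)h[0] * 38;     r0 = (uint64_t)t; c = (uint64_t)(t >> 64);
    t = (unsigned __int128)l[1] + (unsigned __int128)h[1] * 38 + c; r1 = (uint64_t)t; c = (uint64_t)(t >> 64);
    t = (unsigned __int128)l[2] + (unsigned __int128)h[2] * 38 + c; r2 = (uint64_t)t; c = (uint64_t)(t >> 64);
    t = (unsigned __int128)l[3] + (unsigned __int128)h[3] * 38 + c; r3 = (uint64_t)t; c = (uint64_t)(t >> 64);

    /* everything at bit 255 and above goes around again as *19 */
    uint64_t top = (c << 1) | (r3 >> 63);
    r3 &= 0x7fffffffffffffffULL;
    t = (unsigned __int128)r0 + (unsigned __int128)top * 19; r0 = (uint64_t)t;
    t = (unsigned __int128)r1 + (uint64_t)(t >> 64);         r1 = (uint64_t)t;
    t = (unsigned __int128)r2 + (uint64_t)(t >> 64);         r2 = (uint64_t)t;
    r3 += (uint64_t)(t >> 64);

    r[0] = r0; r[1] = r1; r[2] = r2; r[3] = r3;
}

The assembly interleaves the 38*h mulq instructions with the carry adds, presumably for scheduling, but the arithmetic is the same, and like the generated code this leaves the result only partially reduced.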
adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 + movq 16(%rcx), %rax + mulq (%rdi) addq %rax, %r12 adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) + # A[1] * B[1] + movq 8(%rcx), %rax + mulq 8(%rdi) + xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[0] + movq (%rcx), %rax + mulq 16(%rdi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) + # A[0] * B[3] + movq 24(%rcx), %rax + mulq (%rdi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) + # A[1] * B[2] + movq 16(%rcx), %rax + mulq 8(%rdi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[1] + movq 8(%rcx), %rax + mulq 16(%rdi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[0] + movq (%rcx), %rax + mulq 24(%rdi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[1] * B[3] + movq 24(%rcx), %rax + mulq 8(%rdi) + xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 + adcq $0x00, %rbx + # A[2] * B[2] + movq 16(%rcx), %rax + mulq 16(%rdi) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[3] * B[1] + movq 8(%rcx), %rax + mulq 24(%rdi) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[2] * B[3] + movq 24(%rcx), %rax + mulq 16(%rdi) + xorq %rbp, %rbp + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[2] + movq 16(%rcx), %rax + mulq 24(%rdi) + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[3] + movq 24(%rcx), %rax + mulq 24(%rdi) + addq %rax, %rbx + adcq %rdx, %rbp + movq $38, %rax + mulq %rbp + addq %rax, %r13 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %rdx + imulq $19, %rdx, %rdx + andq %r9, %r13 + movq %rdx, %r9 + movq $38, %rax mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 + xorq %r15, %r15 + addq %rax, %r11 + movq $38, %rax + adcq %rdx, %r15 + mulq %rbx + xorq %rbx, %rbx + addq %rax, %r12 + adcq %rdx, %rbx + addq %r9, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq 
$0x00, %r11 + adcq %r15, %r12 + adcq %rbx, %r13 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - leaq 48(%rsp), %rdi - movq 128(%rsp), %rsi - movq 128(%rsp), %rbx + # Add-Sub # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx + movq %r10, %r14 + addq (%rsi), %r10 + movq %r11, %r15 + adcq 8(%rsi), %r11 + movq %r12, %rbx + adcq 16(%rsi), %r12 + movq %r13, %rbp + adcq 24(%rsi), %r13 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r13, %rax movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx + imulq $19, %rax + andq %rdx, %r13 # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq (%rsp), %rdi - movq 16(%rsp), %rsi - movq 8(%rsp), %rbx + addq %rax, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx + subq (%rsi), %r14 + sbbq 8(%rsi), %r15 + sbbq 16(%rsi), %rbx + sbbq 24(%rsi), %rbp + sbbq %rax, %rax + shldq $0x01, %rbp, %rax + imulq $-19, %rax + andq %rdx, %rbp # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 16(%rsp), %rsi - movq 8(%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx + subq %rax, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rsi) + movq %r11, 8(%rsi) + movq %r12, 16(%rsi) + movq %r13, 24(%rsi) + movq %r14, (%rdi) + movq %r15, 8(%rdi) + movq %rbx, 16(%rdi) + movq %rbp, 24(%rdi) + subq $32, %r8 + # Double + movq (%r8), %r10 + movq 8(%r8), %r11 + addq %r10, %r10 + movq 16(%r8), %r12 + adcq %r11, %r11 + movq 24(%r8), %r13 + adcq %r12, %r12 + adcq %r13, %r13 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r13, %rax movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx + imulq $19, %rax + andq %rdx, %r13 # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - leaq 48(%rsp), %rsi - movq 24(%rsp), %rbx + addq %rax, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 + movq %rdi, %rsi + addq $0x60, %rsi + addq $0x40, %rdi + # Add-Sub # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx + movq %r10, %r14 + addq (%rsi), %r10 + movq %r11, %r15 + adcq 8(%rsi), %r11 + movq %r12, %rbx + adcq 16(%rsi), %r12 + movq %r13, %rbp + adcq 24(%rsi), %r13 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r13, %rax movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask 
the modulus - andq %rcx, %rax - andq %rcx, %rdx + imulq $19, %rax + andq %rdx, %r13 # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 24(%rsp), %rdi - leaq 48(%rsp), %rsi - movq 24(%rsp), %rbx + addq %rax, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx + subq (%rsi), %r14 + sbbq 8(%rsi), %r15 + sbbq 16(%rsi), %rbx + sbbq 24(%rsi), %rbp + sbbq %rax, %rax + shldq $0x01, %rbp, %rax + imulq $-19, %rax + andq %rdx, %rbp # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - addq $0x50, %rsp + subq %rax, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + movq %r14, (%rsi) + movq %r15, 8(%rsi) + movq %rbx, 16(%rsi) + movq %rbp, 24(%rsi) + addq $24, %rsp + popq %rbp + popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 - popq %rbx repz retq #ifndef __APPLE__ -.size fe_ge_madd_x64,.-fe_ge_madd_x64 +.size ge_madd_x64,.-ge_madd_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text -.globl fe_ge_msub_x64 -.type fe_ge_msub_x64,@function +.globl ge_msub_x64 +.type ge_msub_x64,@function .align 16 -fe_ge_msub_x64: +ge_msub_x64: #else .section __TEXT,__text -.globl _fe_ge_msub_x64 +.globl _ge_msub_x64 .p2align 4 -_fe_ge_msub_x64: +_ge_msub_x64: #endif /* __APPLE__ */ - pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 - subq $0x50, %rsp + pushq %rbx + pushq %rbp + movq %rdx, %rcx + subq $24, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rcx, 24(%rsp) - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq (%rsp), %rdi - movq 40(%rsp), %rsi - movq 32(%rsp), %rbx + movq %rcx, 16(%rsp) + movq %rsi, %r8 + movq %rsi, %rcx + addq $32, %rcx + movq %rdi, %rsi + addq $32, %rsi + # Add-Sub # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx + movq (%rcx), %r10 + movq 8(%rcx), %r11 + movq 16(%rcx), %r12 + movq 24(%rcx), %r13 + movq %r10, %r14 + addq (%r8), %r10 + movq %r11, %r15 + adcq 8(%r8), %r11 + movq %r12, %rbx + adcq 16(%r8), %r12 + movq %r13, %rbp + adcq 24(%r8), %r13 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r13, %rax movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx + imulq $19, %rax + andq %rdx, %r13 # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 40(%rsp), %rsi - movq 32(%rsp), %rbx + addq %rax, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the 
modulus - andq %rcx, %rax - andq %rcx, %rdx + subq (%r8), %r14 + sbbq 8(%r8), %r15 + sbbq 16(%r8), %rbx + sbbq 24(%r8), %rbp + sbbq %rax, %rax + shldq $0x01, %rbp, %rax + imulq $-19, %rax + andq %rdx, %rbp # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - movq (%rsp), %rsi - movq 160(%rsp), %rbx + subq %rax, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + movq %r14, (%rsi) + movq %r15, 8(%rsi) + movq %rbx, 16(%rsi) + movq %rbp, 24(%rsi) + movq 16(%rsp), %rcx + addq $32, %rdi # Multiply # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 + movq (%rcx), %rax + mulq (%rdi) + movq %rax, %r10 + movq %rdx, %r11 # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) + movq 8(%rcx), %rax + mulq (%rdi) xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) + # A[1] * B[0] + movq (%rcx), %rax + mulq 8(%rdi) + xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 + # A[0] * B[2] + movq 16(%rcx), %rax + mulq (%rdi) addq %rax, %r12 adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) + # A[1] * B[1] + movq 8(%rcx), %rax + mulq 8(%rdi) + xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[0] + movq (%rcx), %rax + mulq 16(%rdi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) + # A[0] * B[3] + movq 24(%rcx), %rax + mulq (%rdi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) + # A[1] * B[2] + movq 16(%rcx), %rax + mulq 8(%rdi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[1] + movq 8(%rcx), %rax + mulq 16(%rdi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[0] + movq (%rcx), %rax + mulq 24(%rdi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[1] * B[3] + movq 24(%rcx), %rax + mulq 8(%rdi) + xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq 
%r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 + adcq $0x00, %rbx + # A[2] * B[2] + movq 16(%rcx), %rax + mulq 16(%rdi) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[3] * B[1] + movq 8(%rcx), %rax + mulq 24(%rdi) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[2] * B[3] + movq 24(%rcx), %rax + mulq 16(%rdi) + xorq %rbp, %rbp + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[2] + movq 16(%rcx), %rax + mulq 24(%rdi) + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[3] + movq 24(%rcx), %rax + mulq 24(%rdi) + addq %rax, %rbx + adcq %rdx, %rbp + movq $38, %rax + mulq %rbp + addq %rax, %r13 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %rdx + imulq $19, %rdx, %rdx + andq %r9, %r13 + movq %rdx, %r9 + movq $38, %rax mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 + xorq %r15, %r15 + addq %rax, %r11 + movq $38, %rax + adcq %rdx, %r15 + mulq %rbx + xorq %rbx, %rbx + addq %rax, %r12 + adcq %rdx, %rbx + addq %r9, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + adcq %r15, %r12 + adcq %rbx, %r13 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 8(%rsp), %rsi - movq 152(%rsp), %rbx + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + addq $0x60, %r8 + addq $0x40, %rcx + addq $0x40, %rdi # Multiply # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 + movq (%rcx), %rax + mulq (%r8) + movq %rax, %r10 + movq %rdx, %r11 # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) + movq 8(%rcx), %rax + mulq (%r8) xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) + # A[1] * B[0] + movq (%rcx), %rax + mulq 8(%r8) + xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 + # A[0] * B[2] + movq 16(%rcx), %rax + mulq (%r8) addq %rax, %r12 adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) + # A[1] * B[1] + movq 8(%rcx), %rax + mulq 8(%r8) + xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[0] + movq (%rcx), %rax + mulq 16(%r8) 
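The fused "Add-Sub" blocks in ge_madd_x64, ge_msub_x64 and ge_add_x64 produce a+b and a-b in one pass over the limbs and, as above, keep both results only partially reduced: a carry out of bit 255 is worth 19 and the 2^256 wrap of a borrow is worth 38. A C sketch of the idea, under the assumption that the operands are themselves already partially reduced so the small correction cannot underflow (names are mine, not the patch's):

#include <stdint.h>

#define MASK63 0x7fffffffffffffffULL

/* Sketch only: radd = a + b and rsub = a - b modulo 2^255 - 19,
 * both kept partially reduced. */
static void fe_add_sub_sketch(uint64_t radd[4], uint64_t rsub[4],
                              const uint64_t a[4], const uint64_t b[4])
{
    unsigned __int128 t;
    uint64_t c = 0, w = 0, f;
    int i;

    for (i = 0; i < 4; i++) {                  /* a + b */
        t = (unsigned __int128)a[i] + b[i] + c;
        radd[i] = (uint64_t)t;  c = (uint64_t)(t >> 64);
    }
    f = ((c << 1) | (radd[3] >> 63)) * 19;     /* bits >= 255, folded as *19 */
    radd[3] &= MASK63;
    for (i = 0; i < 4 && f; i++) {
        t = (unsigned __int128)radd[i] + f;
        radd[i] = (uint64_t)t;  f = (uint64_t)(t >> 64);
    }

    for (i = 0; i < 4; i++) {                  /* a - b, two's complement */
        t = (unsigned __int128)a[i] - b[i] - w;
        rsub[i] = (uint64_t)t;  w = (uint64_t)(t >> 64) & 1;
    }
    /* correction: +19 per top bit, -38 per borrow (2^256 wrap) */
    int64_t adj = 19 * (int64_t)(rsub[3] >> 63) - 38 * (int64_t)w;
    rsub[3] &= MASK63;
    if (adj >= 0) {
        f = (uint64_t)adj;
        for (i = 0; i < 4 && f; i++) {
            t = (unsigned __int128)rsub[i] + f;
            rsub[i] = (uint64_t)t;  f = (uint64_t)(t >> 64);
        }
    } else {
        f = (uint64_t)(-adj);  w = 0;
        for (i = 0; i < 4; i++) {
            t = (unsigned __int128)rsub[i] - f - w;
            rsub[i] = (uint64_t)t;  w = (uint64_t)(t >> 64) & 1;  f = 0;
        }
    }
}

The generated code reaches the same correction without a branch, building it from sbbq %rax, %rax, shldq and imulq $-19.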
addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) + # A[0] * B[3] + movq 24(%rcx), %rax + mulq (%r8) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) + # A[1] * B[2] + movq 16(%rcx), %rax + mulq 8(%r8) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) - addq %rax, %r14 - adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 + # A[2] * B[1] + movq 8(%rcx), %rax + mulq 16(%r8) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[0] + movq (%rcx), %rax + mulq 24(%r8) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[1] * B[3] + movq 24(%rcx), %rax + mulq 8(%r8) + xorq %rbx, %rbx + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[2] * B[2] + movq 16(%rcx), %rax + mulq 16(%r8) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[3] * B[1] + movq 8(%rcx), %rax + mulq 24(%r8) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[2] * B[3] + movq 24(%rcx), %rax + mulq 16(%r8) + xorq %rbp, %rbp + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[2] + movq 16(%rcx), %rax + mulq 24(%r8) + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[3] + movq 24(%rcx), %rax + mulq 24(%r8) + addq %rax, %rbx + adcq %rdx, %rbp + movq $38, %rax + mulq %rbp + addq %rax, %r13 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %rdx + imulq $19, %rdx, %rdx + andq %r9, %r13 + movq %rdx, %r9 + movq $38, %rax mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 + xorq %r15, %r15 + addq %rax, %r11 + movq $38, %rax + adcq %rdx, %r15 + mulq %rbx + xorq %rbx, %rbx + addq %rax, %r12 + adcq %rdx, %rbx + addq %r9, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + adcq %r15, %r12 + adcq %rbx, %r13 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 24(%rsp), %rdi - movq 144(%rsp), %rsi - movq 136(%rsp), %rbx + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + subq $32, %rcx + subq $0x60, %rdi # Multiply # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 + movq (%rcx), %rax + mulq (%rdi) + movq %rax, %r10 + movq %rdx, %r11 # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) + movq 8(%rcx), %rax + mulq (%rdi) 
xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) + # A[1] * B[0] + movq (%rcx), %rax + mulq 8(%rdi) + xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 + # A[0] * B[2] + movq 16(%rcx), %rax + mulq (%rdi) addq %rax, %r12 adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) + # A[1] * B[1] + movq 8(%rcx), %rax + mulq 8(%rdi) + xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[0] + movq (%rcx), %rax + mulq 16(%rdi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) + # A[0] * B[3] + movq 24(%rcx), %rax + mulq (%rdi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) + # A[1] * B[2] + movq 16(%rcx), %rax + mulq 8(%rdi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[1] + movq 8(%rcx), %rax + mulq 16(%rdi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[0] + movq (%rcx), %rax + mulq 24(%rdi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[1] * B[3] + movq 24(%rcx), %rax + mulq 8(%rdi) + xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 + adcq $0x00, %rbx + # A[2] * B[2] + movq 16(%rcx), %rax + mulq 16(%rdi) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[3] * B[1] + movq 8(%rcx), %rax + mulq 24(%rdi) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[2] * B[3] + movq 24(%rcx), %rax + mulq 16(%rdi) + xorq %rbp, %rbp + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[2] + movq 16(%rcx), %rax + mulq 24(%rdi) + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[3] + movq 24(%rcx), %rax + mulq 24(%rdi) + addq %rax, %rbx + adcq %rdx, %rbp + movq $38, %rax + mulq %rbp + addq %rax, %r13 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %rdx + imulq $19, %rdx, %rdx + andq %r9, %r13 + movq %rdx, %r9 + movq $38, %rax mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 + xorq %r15, %r15 + addq %rax, %r11 + movq $38, %rax + adcq %rdx, %r15 + mulq %rbx + xorq %rbx, %rbx + addq %rax, %r12 + adcq %rdx, %rbx + addq %r9, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, 
%r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + adcq %r15, %r12 + adcq %rbx, %r13 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - leaq 48(%rsp), %rdi - movq 128(%rsp), %rsi - movq 128(%rsp), %rbx + # Add-Sub # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx + movq %r10, %r14 + addq (%rsi), %r10 + movq %r11, %r15 + adcq 8(%rsi), %r11 + movq %r12, %rbx + adcq 16(%rsi), %r12 + movq %r13, %rbp + adcq 24(%rsi), %r13 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r13, %rax movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx + imulq $19, %rax + andq %rdx, %r13 # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq (%rsp), %rdi - movq 16(%rsp), %rsi - movq 8(%rsp), %rbx + addq %rax, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx + subq (%rsi), %r14 + sbbq 8(%rsi), %r15 + sbbq 16(%rsi), %rbx + sbbq 24(%rsi), %rbp + sbbq %rax, %rax + shldq $0x01, %rbp, %rax + imulq $-19, %rax + andq %rdx, %rbp # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 16(%rsp), %rsi - movq 8(%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx + subq %rax, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rsi) + movq %r11, 8(%rsi) + movq %r12, 16(%rsi) + movq %r13, 24(%rsi) + movq %r14, (%rdi) + movq %r15, 8(%rdi) + movq %rbx, 16(%rdi) + movq %rbp, 24(%rdi) + subq $32, %r8 + addq $0x40, %rdi + # Double + movq (%r8), %r10 + movq 8(%r8), %r11 + addq %r10, %r10 + movq 16(%r8), %r12 + adcq %r11, %r11 + movq 24(%r8), %r13 + adcq %r12, %r12 + adcq %r13, %r13 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r13, %rax movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx + imulq $19, %rax + andq %rdx, %r13 # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - leaq 48(%rsp), %rsi - movq 24(%rsp), %rbx - # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq 
%rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 24(%rsp), %rdi - leaq 48(%rsp), %rsi - movq 24(%rsp), %rbx + addq %rax, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 + movq %rdi, %rsi + addq $32, %rsi + # Add-Sub # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx + movq %r10, %r14 + addq (%rsi), %r10 + movq %r11, %r15 + adcq 8(%rsi), %r11 + movq %r12, %rbx + adcq 16(%rsi), %r12 + movq %r13, %rbp + adcq 24(%rsi), %r13 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r13, %rax movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx + imulq $19, %rax + andq %rdx, %r13 # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - addq $0x50, %rsp + addq %rax, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 + # Sub + subq (%rsi), %r14 + sbbq 8(%rsi), %r15 + sbbq 16(%rsi), %rbx + sbbq 24(%rsi), %rbp + sbbq %rax, %rax + shldq $0x01, %rbp, %rax + imulq $-19, %rax + andq %rdx, %rbp + # Add modulus (if underflow) + subq %rax, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rsi) + movq %r11, 8(%rsi) + movq %r12, 16(%rsi) + movq %r13, 24(%rsi) + movq %r14, (%rdi) + movq %r15, 8(%rdi) + movq %rbx, 16(%rdi) + movq %rbp, 24(%rdi) + addq $24, %rsp + popq %rbp + popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 - popq %rbx repz retq #ifndef __APPLE__ -.size fe_ge_msub_x64,.-fe_ge_msub_x64 +.size ge_msub_x64,.-ge_msub_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text -.globl fe_ge_add_x64 -.type fe_ge_add_x64,@function +.globl ge_add_x64 +.type ge_add_x64,@function .align 16 -fe_ge_add_x64: +ge_add_x64: #else .section __TEXT,__text -.globl _fe_ge_add_x64 +.globl _ge_add_x64 .p2align 4 -_fe_ge_add_x64: +_ge_add_x64: #endif /* __APPLE__ */ - pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 - subq $0x50, %rsp + pushq %rbx + pushq %rbp + movq %rdx, %rcx + subq $24, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rcx, 24(%rsp) - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq (%rsp), %rdi - movq 40(%rsp), %rsi - movq 32(%rsp), %rbx + movq %rcx, 16(%rsp) + movq %rsi, %r8 + movq %rsi, %rcx + addq $32, %rcx + movq %rdi, %rsi + addq $32, %rsi + # Add-Sub # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx + movq (%rcx), %r10 + movq 8(%rcx), %r11 + movq 16(%rcx), %r12 + movq 24(%rcx), %r13 + movq %r10, %r14 + addq (%r8), %r10 + movq %r11, %r15 + adcq 8(%r8), %r11 + movq %r12, %rbx + adcq 16(%r8), %r12 + movq %r13, %rbp + adcq 24(%r8), %r13 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r13, %rax movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx + imulq $19, %rax + andq %rdx, %r13 # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 40(%rsp), %rsi - movq 32(%rsp), %rbx + addq %rax, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 # Sub - movq (%rsi), %r8 - 
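ge_add_x64, like ge_madd_x64 and ge_msub_x64 above, now takes base pointers and reaches the individual coordinates with fixed displacements (addq $32, $0x40, $0x60) instead of spilling six separate field-element pointers to the stack. Purely as a reading aid, those offsets are consistent with a point layout along these lines; the names and the field order are my guess, the real definitions live in the C sources:

#include <stdint.h>

typedef uint64_t fe_x64[4];     /* one field element: 4 x 64-bit limbs (32 bytes) */

typedef struct ge_p3_x64 {      /* hypothetical name, for orientation only */
    fe_x64 X;                   /* base + 0x00 */
    fe_x64 Y;                   /* base + 0x20 */
    fe_x64 Z;                   /* base + 0x40 */
    fe_x64 T;                   /* base + 0x60 */
} ge_p3_x64;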
movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx + subq (%r8), %r14 + sbbq 8(%r8), %r15 + sbbq 16(%r8), %rbx + sbbq 24(%r8), %rbp + sbbq %rax, %rax + shldq $0x01, %rbp, %rax + imulq $-19, %rax + andq %rdx, %rbp # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - movq (%rsp), %rsi - movq 160(%rsp), %rbx + subq %rax, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + movq %r14, (%rsi) + movq %r15, 8(%rsi) + movq %rbx, 16(%rsi) + movq %rbp, 24(%rsi) + movq 16(%rsp), %rcx + addq $32, %rcx + addq $32, %rdi # Multiply # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 + movq (%rcx), %rax + mulq (%rdi) + movq %rax, %r10 + movq %rdx, %r11 # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) + movq 8(%rcx), %rax + mulq (%rdi) xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) + # A[1] * B[0] + movq (%rcx), %rax + mulq 8(%rdi) + xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 + # A[0] * B[2] + movq 16(%rcx), %rax + mulq (%rdi) addq %rax, %r12 adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) + # A[1] * B[1] + movq 8(%rcx), %rax + mulq 8(%rdi) + xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[0] + movq (%rcx), %rax + mulq 16(%rdi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) + # A[0] * B[3] + movq 24(%rcx), %rax + mulq (%rdi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) + # A[1] * B[2] + movq 16(%rcx), %rax + mulq 8(%rdi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[1] + movq 8(%rcx), %rax + mulq 16(%rdi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[0] + movq (%rcx), %rax + mulq 24(%rdi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[1] * B[3] + movq 24(%rcx), %rax + mulq 8(%rdi) + xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - 
shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 + adcq $0x00, %rbx + # A[2] * B[2] + movq 16(%rcx), %rax + mulq 16(%rdi) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[3] * B[1] + movq 8(%rcx), %rax + mulq 24(%rdi) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[2] * B[3] + movq 24(%rcx), %rax + mulq 16(%rdi) + xorq %rbp, %rbp + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[2] + movq 16(%rcx), %rax + mulq 24(%rdi) + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[3] + movq 24(%rcx), %rax + mulq 24(%rdi) + addq %rax, %rbx + adcq %rdx, %rbp + movq $38, %rax + mulq %rbp + addq %rax, %r13 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %rdx + imulq $19, %rdx, %rdx + andq %r9, %r13 + movq %rdx, %r9 + movq $38, %rax mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 + xorq %r15, %r15 + addq %rax, %r11 + movq $38, %rax + adcq %rdx, %r15 + mulq %rbx + xorq %rbx, %rbx + addq %rax, %r12 + adcq %rdx, %rbx + addq %r9, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + adcq %r15, %r12 + adcq %rbx, %r13 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 8(%rsp), %rsi - movq 168(%rsp), %rbx + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + addq $0x60, %r8 + addq $0x40, %rcx + addq $0x40, %rdi # Multiply # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 + movq (%rcx), %rax + mulq (%r8) + movq %rax, %r10 + movq %rdx, %r11 # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) + movq 8(%rcx), %rax + mulq (%r8) xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) + # A[1] * B[0] + movq (%rcx), %rax + mulq 8(%r8) + xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 + # A[0] * B[2] + movq 16(%rcx), %rax + mulq (%r8) addq %rax, %r12 adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] 
* B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) + # A[1] * B[1] + movq 8(%rcx), %rax + mulq 8(%r8) + xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[0] + movq (%rcx), %rax + mulq 16(%r8) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) + # A[0] * B[3] + movq 24(%rcx), %rax + mulq (%r8) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) + # A[1] * B[2] + movq 16(%rcx), %rax + mulq 8(%r8) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[1] + movq 8(%rcx), %rax + mulq 16(%r8) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[0] + movq (%rcx), %rax + mulq 24(%r8) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[1] * B[3] + movq 24(%rcx), %rax + mulq 8(%r8) + xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 + adcq $0x00, %rbx + # A[2] * B[2] + movq 16(%rcx), %rax + mulq 16(%r8) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[3] * B[1] + movq 8(%rcx), %rax + mulq 24(%r8) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[2] * B[3] + movq 24(%rcx), %rax + mulq 16(%r8) + xorq %rbp, %rbp + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[2] + movq 16(%rcx), %rax + mulq 24(%r8) + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[3] + movq 24(%rcx), %rax + mulq 24(%r8) + addq %rax, %rbx + adcq %rdx, %rbp + movq $38, %rax + mulq %rbp + addq %rax, %r13 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %rdx + imulq $19, %rdx, %rdx + andq %r9, %r13 + movq %rdx, %r9 + movq $38, %rax mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 + xorq %r15, %r15 + addq %rax, %r11 + movq $38, %rax + adcq %rdx, %r15 + mulq %rbx + xorq %rbx, %rbx + addq %rax, %r12 + adcq %rdx, %rbx + addq %r9, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + adcq %r15, %r12 + adcq %rbx, %r13 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 24(%rsp), %rdi - movq 152(%rsp), %rsi - movq 136(%rsp), %rbx + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + subq $0x60, %rcx + subq $0x60, %rdi # Multiply # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 + movq (%rcx), %rax + mulq (%rdi) + movq %rax, %r10 + movq %rdx, %r11 # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - 
addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) + movq 8(%rcx), %rax + mulq (%rdi) xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) + # A[1] * B[0] + movq (%rcx), %rax + mulq 8(%rdi) + xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 + # A[0] * B[2] + movq 16(%rcx), %rax + mulq (%rdi) addq %rax, %r12 adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) + # A[1] * B[1] + movq 8(%rcx), %rax + mulq 8(%rdi) + xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[0] + movq (%rcx), %rax + mulq 16(%rdi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) - xorq %r15, %r15 + # A[0] * B[3] + movq 24(%rcx), %rax + mulq (%rdi) + xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) + # A[1] * B[2] + movq 16(%rcx), %rax + mulq 8(%rdi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[1] + movq 8(%rcx), %rax + mulq 16(%rdi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[0] + movq (%rcx), %rax + mulq 24(%rdi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[1] * B[3] + movq 24(%rcx), %rax + mulq 8(%rdi) + xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 + adcq $0x00, %rbx + # A[2] * B[2] + movq 16(%rcx), %rax + mulq 16(%rdi) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[3] * B[1] + movq 8(%rcx), %rax + mulq 24(%rdi) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[2] * B[3] + movq 24(%rcx), %rax + mulq 16(%rdi) + xorq %rbp, %rbp + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[2] + movq 16(%rcx), %rax + mulq 24(%rdi) + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[3] + movq 24(%rcx), %rax + mulq 24(%rdi) + addq %rax, %rbx + adcq %rdx, %rbp + movq $38, %rax + mulq %rbp + addq %rax, %r13 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %rdx + imulq $19, %rdx, %rdx + andq %r9, %r13 + movq %rdx, %r9 + movq $38, %rax mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 + xorq %r15, %r15 + addq %rax, %r11 + movq $38, %rax + 
adcq %rdx, %r15 + mulq %rbx + xorq %rbx, %rbx + addq %rax, %r12 + adcq %rdx, %rbx + addq %r9, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + adcq %r15, %r12 + adcq %rbx, %r13 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq (%rsp), %rdi - movq 128(%rsp), %rsi - movq 144(%rsp), %rbx + # Add-Sub + # Add + movq %r10, %r14 + addq (%rsi), %r10 + movq %r11, %r15 + adcq 8(%rsi), %r11 + movq %r12, %rbx + adcq 16(%rsi), %r12 + movq %r13, %rbp + adcq 24(%rsi), %r13 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r13, %rax + movq $0x7fffffffffffffff, %rdx + imulq $19, %rax + andq %rdx, %r13 + # Sub modulus (if overflow) + addq %rax, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 + # Sub + subq (%rsi), %r14 + sbbq 8(%rsi), %r15 + sbbq 16(%rsi), %rbx + sbbq 24(%rsi), %rbp + sbbq %rax, %rax + shldq $0x01, %rbp, %rax + imulq $-19, %rax + andq %rdx, %rbp + # Add modulus (if underflow) + subq %rax, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rsi) + movq %r11, 8(%rsi) + movq %r12, 16(%rsi) + movq %r13, 24(%rsi) + movq %r14, (%rdi) + movq %r15, 8(%rdi) + movq %rbx, 16(%rdi) + movq %rbp, 24(%rdi) + subq $32, %r8 + addq $0x40, %rcx # Multiply # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 + movq (%rcx), %rax + mulq (%r8) + movq %rax, %r10 + movq %rdx, %r11 # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) + movq 8(%rcx), %rax + mulq (%r8) xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) + # A[1] * B[0] + movq (%rcx), %rax + mulq 8(%r8) + xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 + # A[0] * B[2] + movq 16(%rcx), %rax + mulq (%r8) addq %rax, %r12 adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) + # A[1] * B[1] + movq 8(%rcx), %rax + mulq 8(%r8) + xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[0] + movq (%rcx), %rax + mulq 16(%r8) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) + # A[0] * B[3] + movq 24(%rcx), %rax + mulq (%r8) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) + # A[1] * B[2] + 
movq 16(%rcx), %rax + mulq 8(%r8) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[1] + movq 8(%rcx), %rax + mulq 16(%r8) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[0] + movq (%rcx), %rax + mulq 24(%r8) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[1] * B[3] + movq 24(%rcx), %rax + mulq 8(%r8) + xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 + adcq $0x00, %rbx + # A[2] * B[2] + movq 16(%rcx), %rax + mulq 16(%r8) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[3] * B[1] + movq 8(%rcx), %rax + mulq 24(%r8) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[2] * B[3] + movq 24(%rcx), %rax + mulq 16(%r8) + xorq %rbp, %rbp + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[2] + movq 16(%rcx), %rax + mulq 24(%r8) + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[3] + movq 24(%rcx), %rax + mulq 24(%r8) + addq %rax, %rbx + adcq %rdx, %rbp + movq $38, %rax + mulq %rbp + addq %rax, %r13 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %rdx + imulq $19, %rdx, %rdx + andq %r9, %r13 + movq %rdx, %r9 + movq $38, %rax mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 + xorq %r15, %r15 + addq %rax, %r11 + movq $38, %rax + adcq %rdx, %r15 + mulq %rbx + xorq %rbx, %rbx + addq %rax, %r12 + adcq %rdx, %rbx + addq %r9, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + adcq %r15, %r12 + adcq %rbx, %r13 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - leaq 48(%rsp), %rdi - movq (%rsp), %rsi - movq (%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx - movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq (%rsp), %rdi - movq 16(%rsp), %rsi - movq 8(%rsp), %rbx - # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) 
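Note on the rewritten field operations in the ge_* routines above: field elements are four 64-bit little-endian limbs modulo p = 2^255 - 19. The new "Add-Sub" blocks replace the separate add and sub passes of the removed code: the sum and the difference are produced in one sweep over the operands, kept in registers instead of being reloaded from the stack, and the carry (or borrow) out of bit 255 is folded back in with weight 19, since 2^255 == 19 (mod p); that is what the "Sub modulus (if overflow)" / "Add modulus (if underflow)" comments refer to. A minimal Python sketch of the arithmetic (illustrative only; like the assembly it leaves results only partially reduced):

    MASK255 = (1 << 255) - 1

    def fe_addsub(a, b):
        # sum: fold bits 255 and up back in with weight 19 (2^255 == 19 mod p)
        s = a + b
        s = (s & MASK255) + 19 * (s >> 255)
        # difference: Python's arithmetic shift keeps the same identity valid
        # when a - b is negative (the borrow case handled by sbb/imul $-19)
        d = a - b
        d = (d & MASK255) + 19 * (d >> 255)
        return s, d   # congruent to a+b and a-b mod p, not fully reduced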
- movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 16(%rsp), %rsi - movq 8(%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx + addq $0x40, %rdi + # Double + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r13, %rax movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx + imulq $19, %rax + andq %rdx, %r13 # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - leaq 48(%rsp), %rsi - movq 24(%rsp), %rbx + addq %rax, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 + movq %rdi, %rsi + addq $32, %rsi + # Add-Sub # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx + movq %r10, %r14 + addq (%rsi), %r10 + movq %r11, %r15 + adcq 8(%rsi), %r11 + movq %r12, %rbx + adcq 16(%rsi), %r12 + movq %r13, %rbp + adcq 24(%rsi), %r13 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r13, %rax movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx + imulq $19, %rax + andq %rdx, %r13 # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 24(%rsp), %rdi - leaq 48(%rsp), %rsi - movq 24(%rsp), %rbx + addq %rax, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx + subq (%rsi), %r14 + sbbq 8(%rsi), %r15 + sbbq 16(%rsi), %rbx + sbbq 24(%rsi), %rbp + sbbq %rax, %rax + shldq $0x01, %rbp, %rax + imulq $-19, %rax + andq %rdx, %rbp # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - addq $0x50, %rsp + subq %rax, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + movq %r14, (%rsi) + movq %r15, 8(%rsi) + movq %rbx, 16(%rsi) + movq %rbp, 24(%rsi) + addq $24, %rsp + popq %rbp + popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 - popq %rbx repz retq #ifndef __APPLE__ -.size fe_ge_add_x64,.-fe_ge_add_x64 +.size ge_add_x64,.-ge_add_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text -.globl fe_ge_sub_x64 -.type fe_ge_sub_x64,@function +.globl ge_sub_x64 +.type ge_sub_x64,@function .align 16 -fe_ge_sub_x64: +ge_sub_x64: #else .section __TEXT,__text -.globl _fe_ge_sub_x64 +.globl _ge_sub_x64 .p2align 4 -_fe_ge_sub_x64: +_ge_sub_x64: #endif /* __APPLE__ */ - pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 - subq $0x50, %rsp + pushq %rbx + pushq %rbp + movq %rdx, %rcx + subq $24, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rcx, 24(%rsp) - movq 
%r8, 32(%rsp) - movq %r9, 40(%rsp) - movq (%rsp), %rdi - movq 40(%rsp), %rsi - movq 32(%rsp), %rbx + movq %rcx, 16(%rsp) + movq %rsi, %r8 + movq %rsi, %rcx + addq $32, %rcx + movq %rdi, %rsi + addq $32, %rsi + # Add-Sub # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx + movq (%rcx), %r10 + movq 8(%rcx), %r11 + movq 16(%rcx), %r12 + movq 24(%rcx), %r13 + movq %r10, %r14 + addq (%r8), %r10 + movq %r11, %r15 + adcq 8(%r8), %r11 + movq %r12, %rbx + adcq 16(%r8), %r12 + movq %r13, %rbp + adcq 24(%r8), %r13 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r13, %rax movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx + imulq $19, %rax + andq %rdx, %r13 # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 40(%rsp), %rsi - movq 32(%rsp), %rbx + addq %rax, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx + subq (%r8), %r14 + sbbq 8(%r8), %r15 + sbbq 16(%r8), %rbx + sbbq 24(%r8), %rbp + sbbq %rax, %rax + shldq $0x01, %rbp, %rax + imulq $-19, %rax + andq %rdx, %rbp # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - movq (%rsp), %rsi - movq 168(%rsp), %rbx + subq %rax, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + movq %r14, (%rsi) + movq %r15, 8(%rsi) + movq %rbx, 16(%rsi) + movq %rbp, 24(%rsi) + movq 16(%rsp), %rcx + addq $32, %rdi # Multiply # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 + movq (%rcx), %rax + mulq (%rdi) + movq %rax, %r10 + movq %rdx, %r11 # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) + movq 8(%rcx), %rax + mulq (%rdi) xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) + # A[1] * B[0] + movq (%rcx), %rax + mulq 8(%rdi) + xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 + # A[0] * B[2] + movq 16(%rcx), 
%rax + mulq (%rdi) addq %rax, %r12 adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) + # A[1] * B[1] + movq 8(%rcx), %rax + mulq 8(%rdi) + xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[0] + movq (%rcx), %rax + mulq 16(%rdi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) + # A[0] * B[3] + movq 24(%rcx), %rax + mulq (%rdi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) + # A[1] * B[2] + movq 16(%rcx), %rax + mulq 8(%rdi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[1] + movq 8(%rcx), %rax + mulq 16(%rdi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[0] + movq (%rcx), %rax + mulq 24(%rdi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[1] * B[3] + movq 24(%rcx), %rax + mulq 8(%rdi) + xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + adcq $0x00, %rbx + # A[2] * B[2] + movq 16(%rcx), %rax + mulq 16(%rdi) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[3] * B[1] + movq 8(%rcx), %rax + mulq 24(%rdi) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[2] * B[3] + movq 24(%rcx), %rax + mulq 16(%rdi) + xorq %rbp, %rbp + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[2] + movq 16(%rcx), %rax + mulq 24(%rdi) + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[3] + movq 24(%rcx), %rax + mulq 24(%rdi) + addq %rax, %rbx + adcq %rdx, %rbp + movq $38, %rax + mulq %rbp + addq %rax, %r13 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %rdx + imulq $19, %rdx, %rdx + andq %r9, %r13 + movq %rdx, %r9 + movq $38, %rax + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $38, %rax + adcq %rdx, %r14 + mulq %r15 + xorq %r15, %r15 + addq %rax, %r11 + movq $38, %rax + adcq %rdx, %r15 + mulq %rbx + xorq %rbx, %rbx + addq %rax, %r12 + adcq %rdx, %rbx + addq %r9, %r10 + adcq %r14, %r11 + adcq %r15, %r12 + adcq %rbx, %r13 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 8(%rsp), %rsi - movq 160(%rsp), %rbx + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + addq $0x60, %r8 + addq $0x60, %rcx + addq $0x40, %rdi # Multiply # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 + movq (%rcx), %rax + mulq 
(%r8) + movq %rax, %r10 + movq %rdx, %r11 # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) + movq 8(%rcx), %rax + mulq (%r8) xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) + # A[1] * B[0] + movq (%rcx), %rax + mulq 8(%r8) + xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 + # A[0] * B[2] + movq 16(%rcx), %rax + mulq (%r8) addq %rax, %r12 adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) + # A[1] * B[1] + movq 8(%rcx), %rax + mulq 8(%r8) + xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[0] + movq (%rcx), %rax + mulq 16(%r8) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) + # A[0] * B[3] + movq 24(%rcx), %rax + mulq (%r8) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) + # A[1] * B[2] + movq 16(%rcx), %rax + mulq 8(%r8) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[1] + movq 8(%rcx), %rax + mulq 16(%r8) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[0] + movq (%rcx), %rax + mulq 24(%r8) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[1] * B[3] + movq 24(%rcx), %rax + mulq 8(%r8) + xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 + adcq $0x00, %rbx + # A[2] * B[2] + movq 16(%rcx), %rax + mulq 16(%r8) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[3] * B[1] + movq 8(%rcx), %rax + mulq 24(%r8) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[2] * B[3] + movq 24(%rcx), %rax + mulq 16(%r8) + xorq %rbp, %rbp + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[2] + movq 16(%rcx), %rax + mulq 24(%r8) + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[3] + movq 24(%rcx), %rax + mulq 24(%r8) + addq %rax, %rbx + adcq %rdx, %rbp + movq $38, %rax + mulq %rbp + addq %rax, %r13 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %rdx + imulq $19, %rdx, %rdx + andq %r9, %r13 + movq %rdx, %r9 + movq $38, %rax mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq 
$19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 + xorq %r15, %r15 + addq %rax, %r11 + movq $38, %rax + adcq %rdx, %r15 + mulq %rbx + xorq %rbx, %rbx + addq %rax, %r12 + adcq %rdx, %rbx + addq %r9, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + adcq %r15, %r12 + adcq %rbx, %r13 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 24(%rsp), %rdi - movq 152(%rsp), %rsi - movq 136(%rsp), %rbx + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + subq $0x40, %rcx + subq $0x60, %rdi # Multiply # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 + movq (%rcx), %rax + mulq (%rdi) + movq %rax, %r10 + movq %rdx, %r11 # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) + movq 8(%rcx), %rax + mulq (%rdi) xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) + # A[1] * B[0] + movq (%rcx), %rax + mulq 8(%rdi) + xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 + # A[0] * B[2] + movq 16(%rcx), %rax + mulq (%rdi) addq %rax, %r12 adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) + # A[1] * B[1] + movq 8(%rcx), %rax + mulq 8(%rdi) + xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[0] + movq (%rcx), %rax + mulq 16(%rdi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) + # A[0] * B[3] + movq 24(%rcx), %rax + mulq (%rdi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) + # A[1] * B[2] + movq 16(%rcx), %rax + mulq 8(%rdi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[1] + movq 8(%rcx), %rax + mulq 16(%rdi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[0] + movq (%rcx), %rax + mulq 24(%rdi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[1] * B[3] + movq 24(%rcx), %rax + mulq 8(%rdi) + xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq 
$0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 + adcq $0x00, %rbx + # A[2] * B[2] + movq 16(%rcx), %rax + mulq 16(%rdi) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[3] * B[1] + movq 8(%rcx), %rax + mulq 24(%rdi) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[2] * B[3] + movq 24(%rcx), %rax + mulq 16(%rdi) + xorq %rbp, %rbp + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[2] + movq 16(%rcx), %rax + mulq 24(%rdi) + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[3] + movq 24(%rcx), %rax + mulq 24(%rdi) + addq %rax, %rbx + adcq %rdx, %rbp + movq $38, %rax + mulq %rbp + addq %rax, %r13 + adcq $0x00, %rdx + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %rdx + imulq $19, %rdx, %rdx + andq %r9, %r13 + movq %rdx, %r9 + movq $38, %rax mulq %r14 xorq %r14, %r14 addq %rax, %r10 - movq $19, %rax + movq $38, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 + xorq %r15, %r15 + addq %rax, %r11 + movq $38, %rax + adcq %rdx, %r15 + mulq %rbx + xorq %rbx, %rbx + addq %rax, %r12 + adcq %rdx, %rbx + addq %r9, %r10 adcq %r14, %r11 - adcq %rax, %r11 - adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + adcq %r15, %r12 + adcq %rbx, %r13 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq (%rsp), %rdi - movq 128(%rsp), %rsi - movq 144(%rsp), %rbx + # Add-Sub + # Add + movq %r10, %r14 + addq (%rsi), %r10 + movq %r11, %r15 + adcq 8(%rsi), %r11 + movq %r12, %rbx + adcq 16(%rsi), %r12 + movq %r13, %rbp + adcq 24(%rsi), %r13 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r13, %rax + movq $0x7fffffffffffffff, %rdx + imulq $19, %rax + andq %rdx, %r13 + # Sub modulus (if overflow) + addq %rax, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 + # Sub + subq (%rsi), %r14 + sbbq 8(%rsi), %r15 + sbbq 16(%rsi), %rbx + sbbq 24(%rsi), %rbp + sbbq %rax, %rax + shldq $0x01, %rbp, %rax + imulq $-19, %rax + andq %rdx, %rbp + # Add modulus (if underflow) + subq %rax, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rsi) + movq %r11, 8(%rsi) + movq %r12, 16(%rsi) + movq %r13, 24(%rsi) + movq %r14, (%rdi) + movq %r15, 8(%rdi) + movq %rbx, 16(%rdi) + movq %rbp, 24(%rdi) + subq $32, %r8 + addq $32, %rcx # Multiply # A[0] * B[0] - movq (%rbx), %rax - mulq (%rsi) - movq %rax, %r8 - movq %rdx, %r9 + movq (%rcx), %rax + mulq (%r8) + movq %rax, %r10 + movq %rdx, %r11 # A[0] * B[1] - movq 8(%rbx), %rax - mulq (%rsi) - xorq %r10, %r10 - addq %rax, %r9 - adcq %rdx, %r10 - # A[1] * B[0] - movq (%rbx), %rax - mulq 8(%rsi) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - adcq $0x00, %r11 - # A[0] * B[2] - movq 16(%rbx), %rax - mulq (%rsi) - addq %rax, %r10 - adcq %rdx, %r11 - # A[1] * B[1] - movq 8(%rbx), %rax - mulq 8(%rsi) + movq 8(%rcx), %rax + mulq (%r8) xorq %r12, %r12 - addq %rax, %r10 - adcq %rdx, %r11 - adcq $0x00, %r12 - # A[2] * B[0] - movq (%rbx), %rax - mulq 16(%rsi) - addq %rax, %r10 
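Note on the "Multiply" blocks in these hunks: the 4x4-limb schoolbook product itself is unchanged, but the reduction that follows it is reworked. The removed code shifted the top half down one bit and multiplied it by 19; the new code folds the upper four limbs in directly with a factor of 38, using 2^256 == 2*19 = 38 (mod 2^255 - 19), and then folds the remaining bits above bit 255 with a factor of 19. A short Python model of the fold (illustrative only; the assembly does the same thing limb by limb):

    MASK255 = (1 << 255) - 1

    def fold_512(prod):
        # prod: raw 512-bit schoolbook product of two field elements
        lo = prod & ((1 << 256) - 1)
        hi = prod >> 256
        t = lo + 38 * hi                     # 2^256 == 38  (mod 2^255 - 19)
        t = (t & MASK255) + 19 * (t >> 255)  # 2^255 == 19  (mod 2^255 - 19)
        return t                             # congruent to prod, only partially reduced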
- adcq %rdx, %r11 - adcq $0x00, %r12 - # A[0] * B[3] - movq 24(%rbx), %rax - mulq (%rsi) - xorq %r13, %r13 - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[1] * B[2] - movq 16(%rbx), %rax - mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 - adcq $0x00, %r13 - # A[2] * B[1] - movq 8(%rbx), %rax - mulq 16(%rsi) - addq %rax, %r11 - adcq %rdx, %r12 - adcq $0x00, %r13 - # A[3] * B[0] - movq (%rbx), %rax - mulq 24(%rsi) + # A[1] * B[0] + movq (%rcx), %rax + mulq 8(%r8) + xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 - # A[1] * B[3] - movq 24(%rbx), %rax - mulq 8(%rsi) - xorq %r14, %r14 + # A[0] * B[2] + movq 16(%rcx), %rax + mulq (%r8) addq %rax, %r12 adcq %rdx, %r13 - adcq $0x00, %r14 - # A[2] * B[2] - movq 16(%rbx), %rax - mulq 16(%rsi) + # A[1] * B[1] + movq 8(%rcx), %rax + mulq 8(%r8) + xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[3] * B[1] - movq 8(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[0] + movq (%rcx), %rax + mulq 16(%r8) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 - # A[2] * B[3] - movq 24(%rbx), %rax - mulq 16(%rsi) + # A[0] * B[3] + movq 24(%rcx), %rax + mulq (%r8) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[2] - movq 16(%rbx), %rax - mulq 24(%rsi) + # A[1] * B[2] + movq 16(%rcx), %rax + mulq 8(%r8) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 - # A[3] * B[3] - movq 24(%rbx), %rax - mulq 24(%rsi) + # A[2] * B[1] + movq 8(%rcx), %rax + mulq 16(%r8) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[0] + movq (%rcx), %rax + mulq 24(%r8) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[1] * B[3] + movq 24(%rcx), %rax + mulq 8(%r8) + xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rax - mulq %r12 - xorq %r12, %r12 - addq %rax, %r8 - movq $19, %rax - adcq %rdx, %r12 - mulq %r13 - xorq %r13, %r13 - addq %rax, %r9 - movq $19, %rax - adcq %rdx, %r13 - mulq %r14 - xorq %r14, %r14 - addq %rax, %r10 - movq $19, %rax - adcq %rdx, %r14 - mulq %r15 - # Add remaining product results in - addq %r12, %r9 - adcq %r13, %r10 - adcq %r14, %r11 - adcq %rax, %r11 + adcq $0x00, %rbx + # A[2] * B[2] + movq 16(%rcx), %rax + mulq 16(%r8) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[3] * B[1] + movq 8(%rcx), %rax + mulq 24(%r8) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x00, %rbx + # A[2] * B[3] + movq 24(%rcx), %rax + mulq 16(%r8) + xorq %rbp, %rbp + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[2] + movq 16(%rcx), %rax + mulq 24(%r8) + addq %rax, %r15 + adcq %rdx, %rbx + adcq $0x00, %rbp + # A[3] * B[3] + movq 24(%rcx), %rax + mulq 24(%r8) + addq %rax, %rbx + adcq %rdx, %rbp + movq $38, %rax + mulq %rbp + addq %rax, %r13 adcq $0x00, %rdx - # Overflow - shldq $0x01, %r11, %rdx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %rdx + imulq $19, %rdx, %rdx + andq %r9, %r13 + movq %rdx, %r9 + movq $38, %rax + mulq %r14 + xorq %r14, %r14 + addq %rax, %r10 + movq $38, %rax + adcq %rdx, %r14 + mulq %r15 + xorq %r15, 
%r15 + addq %rax, %r11 + movq $38, %rax + adcq %rdx, %r15 + mulq %rbx + xorq %rbx, %rbx + addq %rax, %r12 + adcq %rdx, %rbx + addq %r9, %r10 + adcq %r14, %r11 + adcq %r15, %r12 + adcq %rbx, %r13 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - leaq 48(%rsp), %rdi - movq (%rsp), %rsi - movq (%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx + # Double + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r13, %rax movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx + imulq $19, %rax + andq %rdx, %r13 # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq (%rsp), %rdi - movq 16(%rsp), %rsi - movq 8(%rsp), %rbx - # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx - # Add modulus (if underflow) - addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 - adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 16(%rsp), %rsi - movq 8(%rsp), %rbx + addq %rax, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 + movq %rdi, %rsi + addq $0x40, %rsi + addq $0x60, %rdi + # Add-Sub # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx + movq %r10, %r14 + addq (%rdi), %r10 + movq %r11, %r15 + adcq 8(%rdi), %r11 + movq %r12, %rbx + adcq 16(%rdi), %r12 + movq %r13, %rbp + adcq 24(%rdi), %r13 + movq $0x00, %rax + adcq $0x00, %rax + shldq $0x01, %r13, %rax movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx + imulq $19, %rax + andq %rdx, %r13 # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - leaq 48(%rsp), %rsi - movq 24(%rsp), %rbx + addq %rax, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rbx), %r8 - movq $0x00, %rcx - sbbq 8(%rbx), %r9 - movq $-19, %rax - sbbq 16(%rbx), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rbx), %r11 - sbbq $0x00, %rcx - # Mask the modulus - andq %rcx, %rax - andq %rcx, %rdx + subq (%rdi), %r14 + sbbq 8(%rdi), %r15 + sbbq 16(%rdi), %rbx + sbbq 24(%rdi), %rbp + sbbq %rax, %rax + shldq $0x01, %rbp, %rax + imulq $-19, %rax + andq %rdx, %rbp # Add modulus (if underflow) + subq %rax, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + movq %r14, (%rsi) + movq %r15, 8(%rsi) + movq %rbx, 16(%rsi) + movq %rbp, 24(%rsi) + addq $24, %rsp + popq %rbp + popq %rbx + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq 
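The next two routines added by this patch, sc_reduce_x64 and sc_muladd_x64, bring Ed25519 scalar arithmetic into the x64 assembly as well. Both work modulo the group order L = 2^252 + 27742317777372353535851937790883648493: sc_reduce_x64 reduces a 512-bit little-endian value in place to 32 bytes, and sc_muladd_x64 computes (a*b + c) mod L, the shape of the S computation in Ed25519 signing. The constants 0xa7ed9ce5a30a2c13 and 0xeb2106215d086329 used in the hunks are 64-bit negations of words from the low half of L, used so that multiples of L can be removed with plain multiply-and-add steps, as the inline comments "# * -5812631a5cf5d3ed" and "# * -14def9dea2f79cd7" indicate. A Python reference for the intended results (illustrative of the semantics only, not of the limb-level algorithm):

    # Ed25519 group order
    L = (1 << 252) + 27742317777372353535851937790883648493

    def sc_reduce(x512: bytes) -> bytes:
        # 64-byte little-endian scalar -> 32-byte scalar mod L
        return (int.from_bytes(x512, "little") % L).to_bytes(32, "little")

    def sc_muladd(a: bytes, b: bytes, c: bytes) -> bytes:
        # (a * b + c) mod L, all arguments 32-byte little-endian scalars
        r = (int.from_bytes(a, "little") * int.from_bytes(b, "little")
             + int.from_bytes(c, "little")) % L
        return r.to_bytes(32, "little")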
+#ifndef __APPLE__ +.size ge_sub_x64,.-ge_sub_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl sc_reduce_x64 +.type sc_reduce_x64,@function +.align 16 +sc_reduce_x64: +#else +.section __TEXT,__text +.globl _sc_reduce_x64 +.p2align 4 +_sc_reduce_x64: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + pushq %rbp + movq (%rdi), %r8 + movq 8(%rdi), %r9 + movq 16(%rdi), %r10 + movq 24(%rdi), %r11 + movq 32(%rdi), %r12 + movq 40(%rdi), %r13 + movq 48(%rdi), %r14 + movq 56(%rdi), %r15 + movq %r15, %rcx + movq $0xfffffffffffffff, %rsi + shrq $56, %rcx + shldq $4, %r14, %r15 + shldq $4, %r13, %r14 + shldq $4, %r12, %r13 + shldq $4, %r11, %r12 + andq %rsi, %r11 + andq %rsi, %r15 + # Add order times bits 504..511 + subq %rcx, %r14 + sbbq $0x00, %r15 + movq $0xeb2106215d086329, %rax + mulq %rcx + movq $0x00, %rsi + addq %rax, %r13 + movq $0xa7ed9ce5a30a2c13, %rax + adcq %rdx, %rsi + mulq %rcx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %rsi, %r14 + adcq $0x00, %r15 + # Sub product of top 4 words and order + movq $0xa7ed9ce5a30a2c13, %rcx + movq %r12, %rax + mulq %rcx + mov $0x00, %rbp addq %rax, %r8 - adcq %rcx, %r9 - adcq %rcx, %r10 + adcq %rdx, %rbp + movq %r13, %rax + mulq %rcx + movq $0x00, %rsi + addq %rax, %r9 + adcq %rdx, %rsi + movq %r14, %rax + mulq %rcx + addq %rbp, %r9 + adcq %rax, %r10 adcq %rdx, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 24(%rsp), %rdi - leaq 48(%rsp), %rsi - movq 24(%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rcx - adcq 16(%rbx), %r10 - movq $-19, %rax - adcq 24(%rbx), %rcx - movq $0x7fffffffffffffff, %rdx - movq %rcx, %r11 - sarq $63, %rcx - # Mask the modulus + mov $0x00, %rbx + adcq $0x00, %rbx + movq %r15, %rax + mulq %rcx + addq %rsi, %r10 + adcq %rax, %r11 + adcq %rdx, %rbx + movq $0xeb2106215d086329, %rcx + movq %r12, %rax + mulq %rcx + mov $0x00, %rbp + addq %rax, %r9 + adcq %rdx, %rbp + movq %r13, %rax + mulq %rcx + movq $0x00, %rsi + addq %rax, %r10 + adcq %rdx, %rsi + movq %r14, %rax + mulq %rcx + addq %rbp, %r10 + adcq %rax, %r11 + adcq %rdx, %rbx + mov $0x00, %rbp + adcq $0x00, %rbp + movq %r15, %rax + mulq %rcx + addq %rsi, %r11 + adcq %rax, %rbx + adcq %rdx, %rbp + subq %r12, %r10 + movq %rbx, %r12 + sbbq %r13, %r11 + movq %rbp, %r13 + sbbq %r14, %r12 + sbbq %r15, %r13 + movq %r13, %rcx + sarq $57, %rcx + # Conditionally subtract order starting at bit 125 + movq $0xa000000000000000, %rax + movq $0xcb024c634b9eba7d, %rdx + movq $0x29bdf3bd45ef39a, %rbx + movq $0x200000000000000, %rbp andq %rcx, %rax andq %rcx, %rdx - # Sub modulus (if overflow) - subq %rax, %r8 - sbbq %rcx, %r9 - sbbq %rcx, %r10 - sbbq %rdx, %r11 + andq %rcx, %rbx + andq %rcx, %rbp + addq %rax, %r9 + adcq %rdx, %r10 + adcq %rbx, %r11 + adcq $0x00, %r12 + adcq %rbp, %r13 + # Move bits 252-376 to own registers + movq $0xfffffffffffffff, %rcx + shldq $4, %r12, %r13 + shldq $4, %r11, %r12 + andq %rcx, %r11 + # Sub product of top 2 words and order + # * -5812631a5cf5d3ed + movq $0xa7ed9ce5a30a2c13, %rcx + movq %r12, %rax + mulq %rcx + movq $0x00, %rbx + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rbx + movq %r13, %rax + mulq %rcx + addq %rax, %r9 + adcq %rdx, %rbx + # * -14def9dea2f79cd7 + movq $0xeb2106215d086329, %rcx + movq %r12, %rax + mulq %rcx + movq $0x00, %rbp + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %rbp + movq %r13, %rax + mulq %rcx + addq %rax, %r10 + adcq %rdx, %rbp + # Add 
overflows at 2 * 64 + movq $0xfffffffffffffff, %rsi + andq %rsi, %r11 + addq %rbx, %r10 + adcq %rbp, %r11 + # Subtract top at 2 * 64 + subq %r12, %r10 + sbbq %r13, %r11 + sbbq %rsi, %rsi + # Conditional sub order + movq $0x5812631a5cf5d3ed, %rax + movq $0x14def9dea2f79cd6, %rdx + movq $0x1000000000000000, %rbx + andq %rsi, %rax + andq %rsi, %rdx + andq %rsi, %rbx + addq %rax, %r8 + movq $0xfffffffffffffff, %rax + adcq %rdx, %r9 + adcq $0x00, %r10 + adcq %rbx, %r11 + andq %rax, %r11 + # Store result movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) - addq $0x50, %rsp + popq %rbp + popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 - popq %rbx repz retq #ifndef __APPLE__ -.size fe_ge_sub_x64,.-fe_ge_sub_x64 +.size sc_reduce_x64,.-sc_reduce_x64 #endif /* __APPLE__ */ -#ifdef HAVE_INTEL_AVX2 #ifndef __APPLE__ .text -.globl fe_mul_avx2 -.type fe_mul_avx2,@function +.globl sc_muladd_x64 +.type sc_muladd_x64,@function .align 16 -fe_mul_avx2: +sc_muladd_x64: #else .section __TEXT,__text -.globl _fe_mul_avx2 +.globl _sc_muladd_x64 .p2align 4 -_fe_mul_avx2: +_sc_muladd_x64: #endif /* __APPLE__ */ + pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rbx - movq %rdx, %rbx + movq %rdx, %rbp # Multiply - # A[0] * B[0] - movq (%rbx), %rdx - mulxq (%rsi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rsi), %rax, %rcx - xorq %r15, %r15 - adcxq %rax, %r9 - # A[1] * B[3] - movq 24(%rbx), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rcx, %r10 - # A[0] * B[1] - movq 8(%rbx), %rdx - mulxq (%rsi), %rax, %rcx - adoxq %rax, %r9 - # A[2] * B[1] - mulxq 16(%rsi), %rax, %r14 - adoxq %rcx, %r10 - adcxq %rax, %r11 - # A[1] * B[2] - movq 16(%rbx), %rdx - mulxq 8(%rsi), %rax, %rcx - adcxq %r14, %r12 - adoxq %rax, %r11 - adcxq %r15, %r13 - adoxq %rcx, %r12 - # A[0] * B[2] - mulxq (%rsi), %rax, %rcx - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rax, %r10 - # A[1] * B[1] - movq 8(%rbx), %rdx - mulxq 8(%rsi), %rdx, %rax - adcxq %rcx, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbx), %rdx - adoxq %rax, %r11 - mulxq 24(%rsi), %rax, %rcx - adcxq %rax, %r12 - # A[2] * B[2] - movq 16(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rax - adcxq %rcx, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbx), %rdx - adoxq %rax, %r13 - mulxq 24(%rsi), %rax, %rcx - adoxq %r15, %r14 - adcxq %rax, %r14 - # A[0] * B[3] - mulxq (%rsi), %rdx, %rax - adcxq %rcx, %r15 - xorq %rcx, %rcx - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbx), %rdx - adcxq %rax, %r12 - mulxq 24(%rsi), %rdx, %rax - adoxq %rdx, %r11 - adoxq %rax, %r12 - # A[2] * B[3] - movq 24(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rax - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbx), %rdx - adcxq %rax, %r14 - mulxq 24(%rsi), %rax, %rdx - adcxq %rcx, %r15 + # A[0] * B[0] + movq (%rbp), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + # A[0] * B[1] + movq 8(%rbp), %rax + mulq (%rsi) + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + # A[1] * B[0] + movq (%rbp), %rax + mulq 8(%rsi) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %r11 + # A[0] * B[2] + movq 16(%rbp), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + # A[1] * B[1] + movq 8(%rbp), %rax + mulq 8(%rsi) + xorq %r12, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[2] * B[0] + movq (%rbp), %rax + mulq 16(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x00, %r12 + # A[0] * B[3] + movq 24(%rbp), %rax + mulq (%rsi) + xorq %r13, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[2] + movq 
16(%rbp), %rax + mulq 8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[2] * B[1] + movq 8(%rbp), %rax + mulq 16(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[3] * B[0] + movq (%rbp), %rax + mulq 24(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x00, %r13 + # A[1] * B[3] + movq 24(%rbp), %rax + mulq 8(%rsi) + xorq %r14, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[2] + movq 16(%rbp), %rax + mulq 16(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[3] * B[1] + movq 8(%rbp), %rax + mulq 24(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x00, %r14 + # A[2] * B[3] + movq 24(%rbp), %rax + mulq 16(%rsi) + xorq %r15, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[2] + movq 16(%rbp), %rax + mulq 24(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + # A[3] * B[3] + movq 24(%rbp), %rax + mulq 24(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + # Add c to a * b + addq (%rcx), %r8 + adcq 8(%rcx), %r9 + adcq 16(%rcx), %r10 + adcq 24(%rcx), %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 + adcq $0x00, %r14 + adcq $0x00, %r15 + movq %r15, %rbx + movq $0xfffffffffffffff, %rcx + shrq $56, %rbx + shldq $4, %r14, %r15 + shldq $4, %r13, %r14 + shldq $4, %r12, %r13 + shldq $4, %r11, %r12 + andq %rcx, %r11 + andq %rcx, %r15 + # Add order times bits 504..507 + subq %rbx, %r14 + sbbq $0x00, %r15 + movq $0xeb2106215d086329, %rax + mulq %rbx + movq $0x00, %rcx + addq %rax, %r13 + movq $0xa7ed9ce5a30a2c13, %rax + adcq %rdx, %rcx + mulq %rbx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %rcx, %r14 + adcq $0x00, %r15 + # Sub product of top 4 words and order + movq $0xa7ed9ce5a30a2c13, %rbx + movq %r12, %rax + mulq %rbx + mov $0x00, %rbp + addq %rax, %r8 + adcq %rdx, %rbp + movq %r13, %rax + mulq %rbx + movq $0x00, %rcx + addq %rax, %r9 + adcq %rdx, %rcx + movq %r14, %rax + mulq %rbx + addq %rbp, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + mov $0x00, %rsi + adcq $0x00, %rsi + movq %r15, %rax + mulq %rbx + addq %rcx, %r10 + adcq %rax, %r11 + adcq %rdx, %rsi + movq $0xeb2106215d086329, %rbx + movq %r12, %rax + mulq %rbx + mov $0x00, %rbp + addq %rax, %r9 + adcq %rdx, %rbp + movq %r13, %rax + mulq %rbx + movq $0x00, %rcx + addq %rax, %r10 + adcq %rdx, %rcx + movq %r14, %rax + mulq %rbx + addq %rbp, %r10 + adcq %rax, %r11 + adcq %rdx, %rsi + mov $0x00, %rbp + adcq $0x00, %rbp + movq %r15, %rax + mulq %rbx + addq %rcx, %r11 + adcq %rax, %rsi + adcq %rdx, %rbp + subq %r12, %r10 + movq %rsi, %r12 + sbbq %r13, %r11 + movq %rbp, %r13 + sbbq %r14, %r12 + sbbq %r15, %r13 + movq %r13, %rbx + sarq $57, %rbx + # Conditionally subtract order starting at bit 125 + movq $0xa000000000000000, %rax + movq $0xcb024c634b9eba7d, %rdx + movq $0x29bdf3bd45ef39a, %rsi + movq $0x200000000000000, %rbp + andq %rbx, %rax + andq %rbx, %rdx + andq %rbx, %rsi + andq %rbx, %rbp + addq %rax, %r9 + adcq %rdx, %r10 + adcq %rsi, %r11 + adcq $0x00, %r12 + adcq %rbp, %r13 + # Move bits 252-376 to own registers + movq $0xfffffffffffffff, %rbx + shldq $4, %r12, %r13 + shldq $4, %r11, %r12 + andq %rbx, %r11 + # Sub product of top 2 words and order + # * -5812631a5cf5d3ed + movq $0xa7ed9ce5a30a2c13, %rbx + movq %r12, %rax + mulq %rbx + movq $0x00, %rsi + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x00, %rsi + movq %r13, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %rsi + # * -14def9dea2f79cd7 + movq $0xeb2106215d086329, %rbx + movq %r12, %rax + mulq %rbx + movq $0x00, %rbp + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x00, %rbp + movq %r13, %rax + mulq 
%rbx + addq %rax, %r10 + adcq %rdx, %rbp + # Add overflows at 2 * 64 + movq $0xfffffffffffffff, %rcx + andq %rcx, %r11 + addq %rsi, %r10 + adcq %rbp, %r11 + # Subtract top at 2 * 64 + subq %r12, %r10 + sbbq %r13, %r11 + sbbq %rcx, %rcx + # Conditional sub order + movq $0x5812631a5cf5d3ed, %rax + movq $0x14def9dea2f79cd6, %rdx + movq $0x1000000000000000, %rsi + andq %rcx, %rax + andq %rcx, %rdx + andq %rcx, %rsi + addq %rax, %r8 + movq $0xfffffffffffffff, %rax + adcq %rdx, %r9 + adcq $0x00, %r10 + adcq %rsi, %r11 + andq %rax, %r11 + # Store result + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + popq %rbx + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + repz retq +#ifndef __APPLE__ +.size sc_muladd_x64,.-sc_muladd_x64 +#endif /* __APPLE__ */ +#endif /* HAVE_ED25519 */ +#ifdef HAVE_INTEL_AVX2 +#ifndef __APPLE__ +.text +.globl fe_mul_avx2 +.type fe_mul_avx2,@function +.align 16 +fe_mul_avx2: +#else +.section __TEXT,__text +.globl _fe_mul_avx2 +.p2align 4 +_fe_mul_avx2: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + movq %rdx, %rbx + # Multiply + # A[0] * B[0] + movq (%rbx), %rdx + mulxq (%rsi), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rsi), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rsi), %rax, %rcx + xorq %r15, %r15 + adcxq %rax, %r9 + # A[3] * B[1] + movq 8(%rbx), %rdx + mulxq 24(%rsi), %r12, %r13 + adcxq %rcx, %r10 + # A[0] * B[1] + mulxq (%rsi), %rax, %rcx + adoxq %rax, %r9 + # A[2] * B[1] + mulxq 16(%rsi), %rax, %r14 + adoxq %rcx, %r10 + adcxq %rax, %r11 + # A[1] * B[2] + movq 16(%rbx), %rdx + mulxq 8(%rsi), %rax, %rcx + adcxq %r14, %r12 + adoxq %rax, %r11 + adcxq %r15, %r13 + adoxq %rcx, %r12 + # A[0] * B[2] + mulxq (%rsi), %rax, %rcx + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rax, %r10 + # A[1] * B[1] + movq 8(%rbx), %rdx + mulxq 8(%rsi), %rdx, %rax + adcxq %rcx, %r11 + adoxq %rdx, %r10 + # A[1] * B[3] + movq 24(%rbx), %rdx + adoxq %rax, %r11 + mulxq 8(%rsi), %rax, %rcx + adcxq %rax, %r12 + # A[2] * B[2] + movq 16(%rbx), %rdx + mulxq 16(%rsi), %rdx, %rax + adcxq %rcx, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rbx), %rdx + adoxq %rax, %r13 + mulxq 24(%rsi), %rax, %rcx + adoxq %r15, %r14 + adcxq %rax, %r14 + # A[0] * B[3] + mulxq (%rsi), %rdx, %rax + adcxq %rcx, %r15 + xorq %rcx, %rcx + adcxq %rdx, %r11 + # A[3] * B[0] + movq 24(%rsi), %rdx + adcxq %rax, %r12 + mulxq (%rbx), %rdx, %rax + adoxq %rdx, %r11 + adoxq %rax, %r12 + # A[3] * B[2] + movq 24(%rsi), %rdx + mulxq 16(%rbx), %rdx, %rax + adcxq %rdx, %r13 + # A[2] * B[3] + movq 24(%rbx), %rdx + adcxq %rax, %r14 + mulxq 16(%rsi), %rax, %rdx + adcxq %rcx, %r15 adoxq %rax, %r13 adoxq %rdx, %r14 adoxq %rcx, %r15 - # Reduce + movq $38, %rdx + mulxq %r15, %r15, %rax + addq %r15, %r11 + adcq $0x00, %rax movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 + shldq $0x01, %r11, %rax + imulq $19, %rax, %rax andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rdx xorq %rcx, %rcx + adoxq %rax, %r8 mulxq %r12, %rax, %r12 adcxq %rax, %r8 adoxq %r12, %r9 @@ -9663,20 +9210,8 @@ _fe_mul_avx2: mulxq %r14, %rax, %r14 adcxq %rax, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rcx, %rdx - adcxq %rcx, %rdx - # Overflow - shldq $0x01, %r11, %rdx + adcxq %rcx, %r11 movq $0x7fffffffffffffff, %rcx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq 
$0x00, %r11 - # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx @@ -9717,93 +9252,81 @@ _fe_sq_avx2: pushq %r14 pushq %r15 # Square - # A[0] * A[1] movq (%rsi), %rdx - mulxq 8(%rsi), %r9, %r10 + movq 8(%rsi), %rax + # A[0] * A[1] + movq %rdx, %r15 + mulxq %rax, %r9, %r10 # A[0] * A[3] mulxq 24(%rsi), %r11, %r12 # A[2] * A[1] movq 16(%rsi), %rdx - mulxq 8(%rsi), %rcx, %rbx - xorq %r15, %r15 + mulxq %rax, %rcx, %rbx + xorq %r8, %r8 adoxq %rcx, %r11 # A[2] * A[3] mulxq 24(%rsi), %r13, %r14 adoxq %rbx, %r12 # A[2] * A[0] - mulxq (%rsi), %rcx, %rbx - adoxq %r15, %r13 + mulxq %r15, %rcx, %rbx + adoxq %r8, %r13 adcxq %rcx, %r10 - adoxq %r15, %r14 + adoxq %r8, %r14 # A[1] * A[3] - movq 8(%rsi), %rdx - mulxq 24(%rsi), %rax, %r8 + movq %rax, %rdx + mulxq 24(%rsi), %rcx, %rdx adcxq %rbx, %r11 - adcxq %rax, %r12 - adcxq %r8, %r13 - adcxq %r15, %r14 - # Double with Carry Flag - xorq %r15, %r15 + adcxq %rcx, %r12 + adcxq %rdx, %r13 + adcxq %r8, %r14 # A[0] * A[0] - movq (%rsi), %rdx - mulxq %rdx, %r8, %rax + movq %r15, %rdx + mulxq %rdx, %r8, %rcx + xorq %r15, %r15 adcxq %r9, %r9 # A[1] * A[1] - movq 8(%rsi), %rdx + movq %rax, %rdx + adoxq %rcx, %r9 mulxq %rdx, %rcx, %rbx adcxq %r10, %r10 - adoxq %rax, %r9 - adcxq %r11, %r11 adoxq %rcx, %r10 + adcxq %r11, %r11 # A[2] * A[2] movq 16(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adcxq %r12, %r12 adoxq %rbx, %r11 + mulxq %rdx, %rbx, %rcx + adcxq %r12, %r12 + adoxq %rbx, %r12 adcxq %r13, %r13 - adoxq %rax, %r12 # A[3] * A[3] movq 24(%rsi), %rdx - mulxq %rdx, %rax, %rbx - adcxq %r14, %r14 adoxq %rcx, %r13 + mulxq %rdx, %rcx, %rbx + adcxq %r14, %r14 + adoxq %rcx, %r14 adcxq %r15, %r15 - adoxq %rax, %r14 adoxq %rbx, %r15 - # Reduce + movq $38, %rdx + mulxq %r15, %r15, %rbx + addq %r15, %r11 + adcq $0x00, %rbx movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 + shldq $0x01, %r11, %rbx + imulq $19, %rbx, %rbx andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rdx xorq %rcx, %rcx - mulxq %r12, %rax, %r12 - adcxq %rax, %r8 + adoxq %rbx, %r8 + mulxq %r12, %rbx, %r12 + adcxq %rbx, %r8 adoxq %r12, %r9 - mulxq %r13, %rax, %r13 - adcxq %rax, %r9 + mulxq %r13, %rbx, %r13 + adcxq %rbx, %r9 adoxq %r13, %r10 - mulxq %r14, %rax, %r14 - adcxq %rax, %r10 + mulxq %r14, %rbx, %r14 + adcxq %rbx, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rcx, %rdx - adcxq %rcx, %rdx - # Overflow - shldq $0x01, %r11, %rdx + adcxq %rcx, %r11 movq $0x7fffffffffffffff, %rcx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx @@ -9847,101 +9370,80 @@ _fe_sq_n_avx2: movq %rdx, %rbp L_fe_sq_n_avx2: # Square - # A[0] * A[1] movq (%rsi), %rdx - mulxq 8(%rsi), %r9, %r10 + movq 8(%rsi), %rax + # A[0] * A[1] + movq %rdx, %r15 + mulxq %rax, %r9, %r10 # A[0] * A[3] mulxq 24(%rsi), %r11, %r12 # A[2] * A[1] movq 16(%rsi), %rdx - mulxq 8(%rsi), %rcx, %rbx - xorq %r15, %r15 + mulxq %rax, %rcx, %rbx + xorq %r8, %r8 adoxq %rcx, %r11 # A[2] * A[3] mulxq 24(%rsi), %r13, %r14 adoxq %rbx, %r12 # A[2] * A[0] - mulxq (%rsi), %rcx, %rbx - adoxq %r15, %r13 + mulxq %r15, %rcx, %rbx + adoxq %r8, %r13 adcxq %rcx, %r10 - adoxq %r15, %r14 + adoxq %r8, %r14 # A[1] * A[3] - movq 8(%rsi), %rdx - mulxq 24(%rsi), %rax, %r8 + movq %rax, %rdx + mulxq 24(%rsi), %rcx, %rdx adcxq %rbx, %r11 - adcxq %rax, %r12 - adcxq 
%r8, %r13 - adcxq %r15, %r14 - # Double with Carry Flag - xorq %r15, %r15 + adcxq %rcx, %r12 + adcxq %rdx, %r13 + adcxq %r8, %r14 # A[0] * A[0] - movq (%rsi), %rdx - mulxq %rdx, %r8, %rax + movq %r15, %rdx + mulxq %rdx, %r8, %rcx + xorq %r15, %r15 adcxq %r9, %r9 # A[1] * A[1] - movq 8(%rsi), %rdx + movq %rax, %rdx + adoxq %rcx, %r9 mulxq %rdx, %rcx, %rbx adcxq %r10, %r10 - adoxq %rax, %r9 - adcxq %r11, %r11 adoxq %rcx, %r10 + adcxq %r11, %r11 # A[2] * A[2] movq 16(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adcxq %r12, %r12 adoxq %rbx, %r11 + mulxq %rdx, %rbx, %rcx + adcxq %r12, %r12 + adoxq %rbx, %r12 adcxq %r13, %r13 - adoxq %rax, %r12 # A[3] * A[3] movq 24(%rsi), %rdx - mulxq %rdx, %rax, %rbx - adcxq %r14, %r14 adoxq %rcx, %r13 + mulxq %rdx, %rcx, %rbx + adcxq %r14, %r14 + adoxq %rcx, %r14 adcxq %r15, %r15 - adoxq %rax, %r14 adoxq %rbx, %r15 - # Reduce + movq $38, %rdx + mulxq %r15, %r15, %rbx + addq %r15, %r11 + adcq $0x00, %rbx movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 + shldq $0x01, %r11, %rbx + imulq $19, %rbx, %rbx andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rdx xorq %rcx, %rcx - mulxq %r12, %rax, %r12 - adcxq %rax, %r8 + adoxq %rbx, %r8 + mulxq %r12, %rbx, %r12 + adcxq %rbx, %r8 adoxq %r12, %r9 - mulxq %r13, %rax, %r13 - adcxq %rax, %r9 + mulxq %r13, %rbx, %r13 + adcxq %rbx, %r9 adoxq %r13, %r10 - mulxq %r14, %rax, %r14 - adcxq %rax, %r10 + mulxq %r14, %rbx, %r14 + adcxq %rbx, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rcx, %rdx - adcxq %rcx, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rcx - imulq $19, %rdx, %rax - andq %rcx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + adcxq %rcx, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) @@ -10002,144 +9504,6 @@ _fe_mul121666_avx2: #endif /* __APPLE__ */ #ifndef __APPLE__ .text -.globl fe_sq2_avx2 -.type fe_sq2_avx2,@function -.align 16 -fe_sq2_avx2: -#else -.section __TEXT,__text -.globl _fe_sq2_avx2 -.p2align 4 -_fe_sq2_avx2: -#endif /* __APPLE__ */ - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - # Square * 2 - # A[0] * A[1] - movq (%rsi), %rdx - mulxq 8(%rsi), %r9, %r10 - # A[0] * A[3] - mulxq 24(%rsi), %r11, %r12 - # A[2] * A[1] - movq 16(%rsi), %rdx - mulxq 8(%rsi), %rcx, %rbx - xorq %r15, %r15 - adoxq %rcx, %r11 - # A[2] * A[3] - mulxq 24(%rsi), %r13, %r14 - adoxq %rbx, %r12 - # A[2] * A[0] - mulxq (%rsi), %rcx, %rbx - adoxq %r15, %r13 - adcxq %rcx, %r10 - adoxq %r15, %r14 - # A[1] * A[3] - movq 8(%rsi), %rdx - mulxq 24(%rsi), %rax, %r8 - adcxq %rbx, %r11 - adcxq %rax, %r12 - adcxq %r8, %r13 - adcxq %r15, %r14 - # Double with Carry Flag - xorq %r15, %r15 - # A[0] * A[0] - movq (%rsi), %rdx - mulxq %rdx, %r8, %rax - adcxq %r9, %r9 - # A[1] * A[1] - movq 8(%rsi), %rdx - mulxq %rdx, %rcx, %rbx - adcxq %r10, %r10 - adoxq %rax, %r9 - adcxq %r11, %r11 - adoxq %rcx, %r10 - # A[2] * A[2] - movq 16(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adcxq %r12, %r12 - adoxq %rbx, %r11 - adcxq %r13, %r13 - adoxq %rax, %r12 - # A[3] * A[3] - movq 24(%rsi), %rdx - mulxq %rdx, %rax, %rbx - adcxq %r14, %r14 - adoxq %rcx, %r13 - adcxq %r15, %r15 - adoxq %rax, %r14 - adoxq %rbx, %r15 - # Reduce - movq $0x7fffffffffffffff, %rbx - 
xorq %rax, %rax - # Move top half into t4-t7 and remove top bit from t3 and double - shldq $3, %r15, %rax - shldq $2, %r14, %r15 - shldq $2, %r13, %r14 - shldq $2, %r12, %r13 - shldq $2, %r11, %r12 - shldq $0x01, %r10, %r11 - shldq $0x01, %r9, %r10 - shldq $0x01, %r8, %r9 - shlq $0x01, %r8 - andq %rbx, %r11 - # Two out left, one in right - andq %rbx, %r15 - # Multiply top bits by 19*19 - imulq $0x169, %rax, %rcx - xorq %rbx, %rbx - # Multiply top half by 19 - movq $19, %rdx - adoxq %rcx, %r8 - mulxq %r12, %rax, %r12 - adcxq %rax, %r8 - adoxq %r12, %r9 - mulxq %r13, %rax, %r13 - adcxq %rax, %r9 - adoxq %r13, %r10 - mulxq %r14, %rax, %r14 - adcxq %rax, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rbx, %rdx - adcxq %rbx, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rdx, %rax - andq %rbx, %r11 - addq %rax, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rbx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - repz retq -#ifndef __APPLE__ -.size fe_sq2_avx2,.-fe_sq2_avx2 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text .globl fe_invert_avx2 .type fe_invert_avx2,@function .align 16 @@ -10413,9 +9777,9 @@ _curve25519_avx2: pushq %r15 pushq %rbp movq %rdx, %r8 - subq $0xc0, %rsp - movq $0x00, 184(%rsp) - movq %rdi, 176(%rsp) + subq $0xb8, %rsp + movq $0x00, 176(%rsp) + movq %rdi, 168(%rsp) # Set one movq $0x01, (%rdi) movq $0x00, 8(%rdi) @@ -10440,18 +9804,18 @@ _curve25519_avx2: movq %r10, 72(%rsp) movq %r11, 80(%rsp) movq %r12, 88(%rsp) - movb $62, 168(%rsp) - movq $3, 160(%rsp) -L_curve25519_avx2_words: + movq $0xfe, %rbx L_curve25519_avx2_bits: - movq 184(%rsp), %rbx - movq 160(%rsp), %r9 - movb 168(%rsp), %cl - movq (%rsi,%r9,8), %rax - shrq %cl, %rax - andq $0x01, %rax - xorq %rax, %rbx - negq %rbx + movq 176(%rsp), %rax + movq %rbx, 160(%rsp) + movq %rbx, %rcx + andq $63, %rcx + shrq $6, %rbx + movq (%rsi,%rbx,8), %rbx + shrq %cl, %rbx + andq $0x01, %rbx + xorq %rbx, %rax + negq %rax # Conditional Swap movq (%rdi), %r9 movq 8(%rdi), %r10 @@ -10461,10 +9825,10 @@ L_curve25519_avx2_bits: xorq 72(%rsp), %r10 xorq 80(%rsp), %r11 xorq 88(%rsp), %r12 - andq %rbx, %r9 - andq %rbx, %r10 - andq %rbx, %r11 - andq %rbx, %r12 + andq %rax, %r9 + andq %rax, %r10 + andq %rax, %r11 + andq %rax, %r12 xorq %r9, (%rdi) xorq %r10, 8(%rdi) xorq %r11, 16(%rdi) @@ -10482,10 +9846,10 @@ L_curve25519_avx2_bits: xorq 40(%rsp), %r10 xorq 48(%rsp), %r11 xorq 56(%rsp), %r12 - andq %rbx, %r9 - andq %rbx, %r10 - andq %rbx, %r11 - andq %rbx, %r12 + andq %rax, %r9 + andq %rax, %r10 + andq %rax, %r11 + andq %rax, %r12 xorq %r9, (%rsp) xorq %r10, 8(%rsp) xorq %r11, 16(%rsp) @@ -10494,49 +9858,46 @@ L_curve25519_avx2_bits: xorq %r10, 40(%rsp) xorq %r11, 48(%rsp) xorq %r12, 56(%rsp) - movq %rax, 184(%rsp) + movq %rbx, 176(%rsp) + # Add-Sub # Add movq (%rdi), %r9 movq 8(%rdi), %r10 movq 16(%rdi), %r11 - movq 24(%rdi), %rax + movq 24(%rdi), %r12 movq %r9, %r13 addq (%rsp), %r9 movq %r10, %r14 adcq 8(%rsp), %r10 movq %r11, %r15 adcq 16(%rsp), %r11 - movq %rax, %rbp - adcq 24(%rsp), %rax - movq $-19, %rcx - movq %rax, %r12 + movq %r12, %rbp + adcq 24(%rsp), %r12 + movq $0x00, %rcx + adcq $0x00, %rcx + shldq $0x01, %r12, %rcx movq $0x7fffffffffffffff, %rbx - sarq $63, %rax - # 
Mask the modulus - andq %rax, %rcx - andq %rax, %rbx + imulq $19, %rcx + andq %rbx, %r12 # Sub modulus (if overflow) - subq %rcx, %r9 - sbbq %rax, %r10 - sbbq %rax, %r11 - sbbq %rbx, %r12 + addq %rcx, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 # Sub subq (%rsp), %r13 - movq $0x00, %rax sbbq 8(%rsp), %r14 - movq $-19, %rcx sbbq 16(%rsp), %r15 - movq $0x7fffffffffffffff, %rbx sbbq 24(%rsp), %rbp - sbbq $0x00, %rax - # Mask the modulus - andq %rax, %rcx - andq %rax, %rbx + sbbq %rcx, %rcx + shldq $0x01, %rbp, %rcx + imulq $-19, %rcx + andq %rbx, %rbp # Add modulus (if underflow) - addq %rcx, %r13 - adcq %rax, %r14 - adcq %rax, %r15 - adcq %rbx, %rbp + subq %rcx, %r13 + sbbq $0x00, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbp movq %r9, (%rdi) movq %r10, 8(%rdi) movq %r11, 16(%rdi) @@ -10545,145 +9906,140 @@ L_curve25519_avx2_bits: movq %r14, 136(%rsp) movq %r15, 144(%rsp) movq %rbp, 152(%rsp) + # Add-Sub # Add movq 64(%rsp), %r9 movq 72(%rsp), %r10 movq 80(%rsp), %r11 - movq 88(%rsp), %rax + movq 88(%rsp), %r12 movq %r9, %r13 addq 32(%rsp), %r9 movq %r10, %r14 adcq 40(%rsp), %r10 movq %r11, %r15 adcq 48(%rsp), %r11 - movq %rax, %rbp - adcq 56(%rsp), %rax - movq $-19, %rcx - movq %rax, %r12 + movq %r12, %rbp + adcq 56(%rsp), %r12 + movq $0x00, %rcx + adcq $0x00, %rcx + shldq $0x01, %r12, %rcx movq $0x7fffffffffffffff, %rbx - sarq $63, %rax - # Mask the modulus - andq %rax, %rcx - andq %rax, %rbx + imulq $19, %rcx + andq %rbx, %r12 # Sub modulus (if overflow) - subq %rcx, %r9 - sbbq %rax, %r10 - sbbq %rax, %r11 - sbbq %rbx, %r12 + addq %rcx, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 # Sub subq 32(%rsp), %r13 - movq $0x00, %rax sbbq 40(%rsp), %r14 - movq $-19, %rcx sbbq 48(%rsp), %r15 - movq $0x7fffffffffffffff, %rbx sbbq 56(%rsp), %rbp - sbbq $0x00, %rax - # Mask the modulus - andq %rax, %rcx - andq %rax, %rbx + sbbq %rcx, %rcx + shldq $0x01, %rbp, %rcx + imulq $-19, %rcx + andq %rbx, %rbp # Add modulus (if underflow) - addq %rcx, %r13 - adcq %rax, %r14 - adcq %rax, %r15 - adcq %rbx, %rbp - movq %r9, (%rsp) - movq %r10, 8(%rsp) - movq %r11, 16(%rsp) - movq %r12, 24(%rsp) + subq %rcx, %r13 + sbbq $0x00, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbp + movq %r9, 32(%rsp) + movq %r10, 40(%rsp) + movq %r11, 48(%rsp) + movq %r12, 56(%rsp) movq %r13, 96(%rsp) movq %r14, 104(%rsp) movq %r15, 112(%rsp) movq %rbp, 120(%rsp) # Multiply # A[0] * B[0] - movq (%rdi), %rdx - mulxq 96(%rsp), %r9, %r10 + movq 128(%rsp), %rdx + mulxq 32(%rsp), %r9, %r10 # A[2] * B[0] - mulxq 112(%rsp), %r11, %r12 + mulxq 48(%rsp), %r11, %r12 # A[1] * B[0] - mulxq 104(%rsp), %rcx, %rbx + mulxq 40(%rsp), %rcx, %rbx xorq %rbp, %rbp adcxq %rcx, %r10 - # A[1] * B[3] - movq 24(%rdi), %rdx - mulxq 104(%rsp), %r13, %r14 + # A[3] * B[1] + movq 136(%rsp), %rdx + mulxq 56(%rsp), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] - movq 8(%rdi), %rdx - mulxq 96(%rsp), %rcx, %rbx + mulxq 32(%rsp), %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] - mulxq 112(%rsp), %rcx, %r15 + mulxq 48(%rsp), %rcx, %r15 adoxq %rbx, %r11 adcxq %rcx, %r12 # A[1] * B[2] - movq 16(%rdi), %rdx - mulxq 104(%rsp), %rcx, %rbx + movq 144(%rsp), %rdx + mulxq 40(%rsp), %rcx, %rbx adcxq %r15, %r13 adoxq %rcx, %r12 adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] - mulxq 96(%rsp), %rcx, %rbx + mulxq 32(%rsp), %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 # A[1] * B[1] - movq 8(%rdi), %rdx - mulxq 104(%rsp), %rdx, %rcx + movq 136(%rsp), %rdx + mulxq 40(%rsp), %rdx, %rcx adcxq %rbx, %r12 adoxq %rdx, %r11 - # A[3] * B[1] - movq 8(%rdi), %rdx + # 
A[1] * B[3] + movq 152(%rsp), %rdx adoxq %rcx, %r12 - mulxq 120(%rsp), %rcx, %rbx + mulxq 40(%rsp), %rcx, %rbx adcxq %rcx, %r13 # A[2] * B[2] - movq 16(%rdi), %rdx - mulxq 112(%rsp), %rdx, %rcx + movq 144(%rsp), %rdx + mulxq 48(%rsp), %rdx, %rcx adcxq %rbx, %r14 adoxq %rdx, %r13 # A[3] * B[3] - movq 24(%rdi), %rdx + movq 152(%rsp), %rdx adoxq %rcx, %r14 - mulxq 120(%rsp), %rcx, %rbx + mulxq 56(%rsp), %rcx, %rbx adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] - mulxq 96(%rsp), %rdx, %rcx + mulxq 32(%rsp), %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 # A[3] * B[0] - movq (%rdi), %rdx + movq 56(%rsp), %rdx adcxq %rcx, %r13 - mulxq 120(%rsp), %rdx, %rcx + mulxq 128(%rsp), %rdx, %rcx adoxq %rdx, %r12 adoxq %rcx, %r13 - # A[2] * B[3] - movq 24(%rdi), %rdx - mulxq 112(%rsp), %rdx, %rcx - adcxq %rdx, %r14 # A[3] * B[2] - movq 16(%rdi), %rdx + movq 56(%rsp), %rdx + mulxq 144(%rsp), %rdx, %rcx + adcxq %rdx, %r14 + # A[2] * B[3] + movq 152(%rsp), %rdx adcxq %rcx, %r15 - mulxq 120(%rsp), %rcx, %rdx + mulxq 48(%rsp), %rcx, %rdx adcxq %rbx, %rbp adoxq %rcx, %r14 adoxq %rdx, %r15 adoxq %rbx, %rbp - # Reduce + movq $38, %rdx + mulxq %rbp, %rbp, %rcx + addq %rbp, %r12 + adcq $0x00, %rcx movq $0x7fffffffffffffff, %rbx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r15, %rbp - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 + shldq $0x01, %r12, %rcx + imulq $19, %rcx, %rcx andq %rbx, %r12 - # Multiply top half by 19 - movq $19, %rdx xorq %rbx, %rbx + adoxq %rcx, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 @@ -10693,28 +10049,7 @@ L_curve25519_avx2_bits: mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 - mulxq %rbp, %rbp, %rdx - adcxq %rbp, %r12 - adoxq %rbx, %rdx - adcxq %rbx, %rdx - # Overflow - shldq $0x01, %r12, %rdx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rdx, %rcx - andq %rbx, %r12 - addq %rcx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Reduce if top bit set - movq %r12, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rbx, %r12 - addq %rdx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 + adcxq %rbx, %r12 # Store movq %r9, 32(%rsp) movq %r10, 40(%rsp) @@ -10722,93 +10057,91 @@ L_curve25519_avx2_bits: movq %r12, 56(%rsp) # Multiply # A[0] * B[0] - movq 128(%rsp), %rdx - mulxq (%rsp), %r9, %r10 + movq (%rdi), %rdx + mulxq 96(%rsp), %r9, %r10 # A[2] * B[0] - mulxq 16(%rsp), %r11, %r12 + mulxq 112(%rsp), %r11, %r12 # A[1] * B[0] - mulxq 8(%rsp), %rcx, %rbx + mulxq 104(%rsp), %rcx, %rbx xorq %rbp, %rbp adcxq %rcx, %r10 - # A[1] * B[3] - movq 152(%rsp), %rdx - mulxq 8(%rsp), %r13, %r14 + # A[3] * B[1] + movq 8(%rdi), %rdx + mulxq 120(%rsp), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] - movq 136(%rsp), %rdx - mulxq (%rsp), %rcx, %rbx + mulxq 96(%rsp), %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] - mulxq 16(%rsp), %rcx, %r15 + mulxq 112(%rsp), %rcx, %r15 adoxq %rbx, %r11 adcxq %rcx, %r12 # A[1] * B[2] - movq 144(%rsp), %rdx - mulxq 8(%rsp), %rcx, %rbx + movq 16(%rdi), %rdx + mulxq 104(%rsp), %rcx, %rbx adcxq %r15, %r13 adoxq %rcx, %r12 adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] - mulxq (%rsp), %rcx, %rbx + mulxq 96(%rsp), %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 # A[1] * B[1] - movq 136(%rsp), %rdx - mulxq 8(%rsp), %rdx, %rcx + movq 8(%rdi), %rdx + mulxq 104(%rsp), %rdx, %rcx adcxq %rbx, %r12 adoxq %rdx, %r11 - # A[3] * B[1] - movq 136(%rsp), %rdx + # A[1] * B[3] + movq 24(%rdi), %rdx adoxq %rcx, %r12 - mulxq 24(%rsp), %rcx, %rbx + mulxq 104(%rsp), %rcx, %rbx 
adcxq %rcx, %r13 # A[2] * B[2] - movq 144(%rsp), %rdx - mulxq 16(%rsp), %rdx, %rcx + movq 16(%rdi), %rdx + mulxq 112(%rsp), %rdx, %rcx adcxq %rbx, %r14 adoxq %rdx, %r13 # A[3] * B[3] - movq 152(%rsp), %rdx + movq 24(%rdi), %rdx adoxq %rcx, %r14 - mulxq 24(%rsp), %rcx, %rbx + mulxq 120(%rsp), %rcx, %rbx adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] - mulxq (%rsp), %rdx, %rcx + mulxq 96(%rsp), %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 # A[3] * B[0] - movq 128(%rsp), %rdx + movq 120(%rsp), %rdx adcxq %rcx, %r13 - mulxq 24(%rsp), %rdx, %rcx + mulxq (%rdi), %rdx, %rcx adoxq %rdx, %r12 adoxq %rcx, %r13 - # A[2] * B[3] - movq 152(%rsp), %rdx - mulxq 16(%rsp), %rdx, %rcx - adcxq %rdx, %r14 # A[3] * B[2] - movq 144(%rsp), %rdx + movq 120(%rsp), %rdx + mulxq 16(%rdi), %rdx, %rcx + adcxq %rdx, %r14 + # A[2] * B[3] + movq 24(%rdi), %rdx adcxq %rcx, %r15 - mulxq 24(%rsp), %rcx, %rdx + mulxq 112(%rsp), %rcx, %rdx adcxq %rbx, %rbp adoxq %rcx, %r14 adoxq %rdx, %r15 adoxq %rbx, %rbp - # Reduce + movq $38, %rdx + mulxq %rbp, %rbp, %rcx + addq %rbp, %r12 + adcq $0x00, %rcx movq $0x7fffffffffffffff, %rbx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r15, %rbp - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 + shldq $0x01, %r12, %rcx + imulq $19, %rcx, %rcx andq %rbx, %r12 - # Multiply top half by 19 - movq $19, %rdx xorq %rbx, %rbx + adoxq %rcx, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 @@ -10818,285 +10151,219 @@ L_curve25519_avx2_bits: mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 - mulxq %rbp, %rbp, %rdx - adcxq %rbp, %r12 - adoxq %rbx, %rdx - adcxq %rbx, %rdx - # Overflow - shldq $0x01, %r12, %rdx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rdx, %rcx - andq %rbx, %r12 - addq %rcx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Reduce if top bit set - movq %r12, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rbx, %r12 - addq %rdx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 + adcxq %rbx, %r12 # Store movq %r9, (%rsp) movq %r10, 8(%rsp) movq %r11, 16(%rsp) movq %r12, 24(%rsp) # Square - # A[0] * A[1] movq 128(%rsp), %rdx - mulxq 136(%rsp), %r10, %r11 + movq 136(%rsp), %rax + # A[0] * A[1] + movq %rdx, %rbp + mulxq %rax, %r10, %r11 # A[0] * A[3] mulxq 152(%rsp), %r12, %r13 # A[2] * A[1] movq 144(%rsp), %rdx - mulxq 136(%rsp), %rcx, %rbx - xorq %rbp, %rbp + mulxq %rax, %rcx, %rbx + xorq %r9, %r9 adoxq %rcx, %r12 # A[2] * A[3] mulxq 152(%rsp), %r14, %r15 adoxq %rbx, %r13 # A[2] * A[0] - mulxq 128(%rsp), %rcx, %rbx - adoxq %rbp, %r14 + mulxq %rbp, %rcx, %rbx + adoxq %r9, %r14 adcxq %rcx, %r11 - adoxq %rbp, %r15 + adoxq %r9, %r15 # A[1] * A[3] - movq 136(%rsp), %rdx - mulxq 152(%rsp), %rax, %r9 + movq %rax, %rdx + mulxq 152(%rsp), %rcx, %rdx adcxq %rbx, %r12 - adcxq %rax, %r13 - adcxq %r9, %r14 - adcxq %rbp, %r15 - # Double with Carry Flag - xorq %rbp, %rbp + adcxq %rcx, %r13 + adcxq %rdx, %r14 + adcxq %r9, %r15 # A[0] * A[0] - movq 128(%rsp), %rdx - mulxq %rdx, %r9, %rax + movq %rbp, %rdx + mulxq %rdx, %r9, %rcx + xorq %rbp, %rbp adcxq %r10, %r10 # A[1] * A[1] - movq 136(%rsp), %rdx + movq %rax, %rdx + adoxq %rcx, %r10 mulxq %rdx, %rcx, %rbx adcxq %r11, %r11 - adoxq %rax, %r10 - adcxq %r12, %r12 adoxq %rcx, %r11 + adcxq %r12, %r12 # A[2] * A[2] movq 144(%rsp), %rdx - mulxq %rdx, %rax, %rcx - adcxq %r13, %r13 adoxq %rbx, %r12 + mulxq %rdx, %rbx, %rcx + adcxq %r13, %r13 + adoxq %rbx, %r13 adcxq %r14, %r14 - adoxq %rax, %r13 # A[3] * A[3] movq 152(%rsp), %rdx - mulxq %rdx, 
%rax, %rbx - adcxq %r15, %r15 adoxq %rcx, %r14 + mulxq %rdx, %rcx, %rbx + adcxq %r15, %r15 + adoxq %rcx, %r15 adcxq %rbp, %rbp - adoxq %rax, %r15 adoxq %rbx, %rbp - # Reduce + movq $38, %rdx + mulxq %rbp, %rbp, %rbx + addq %rbp, %r12 + adcq $0x00, %rbx movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r15, %rbp - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 + shldq $0x01, %r12, %rbx + imulq $19, %rbx, %rbx andq %rcx, %r12 - # Multiply top half by 19 - movq $19, %rdx xorq %rcx, %rcx - mulxq %r13, %rax, %r13 - adcxq %rax, %r9 + adoxq %rbx, %r9 + mulxq %r13, %rbx, %r13 + adcxq %rbx, %r9 adoxq %r13, %r10 - mulxq %r14, %rax, %r14 - adcxq %rax, %r10 + mulxq %r14, %rbx, %r14 + adcxq %rbx, %r10 adoxq %r14, %r11 - mulxq %r15, %rax, %r15 - adcxq %rax, %r11 + mulxq %r15, %rbx, %r15 + adcxq %rbx, %r11 adoxq %r15, %r12 - mulxq %rbp, %rbp, %rdx - adcxq %rbp, %r12 - adoxq %rcx, %rdx - adcxq %rcx, %rdx - # Overflow - shldq $0x01, %r12, %rdx - movq $0x7fffffffffffffff, %rcx - imulq $19, %rdx, %rax - andq %rcx, %r12 - addq %rax, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Reduce if top bit set - movq %r12, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r12 - addq %rdx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 + adcxq %rcx, %r12 # Store movq %r9, 96(%rsp) movq %r10, 104(%rsp) movq %r11, 112(%rsp) movq %r12, 120(%rsp) # Square - # A[0] * A[1] movq (%rdi), %rdx - mulxq 8(%rdi), %r10, %r11 + movq 8(%rdi), %rax + # A[0] * A[1] + movq %rdx, %rbp + mulxq %rax, %r10, %r11 # A[0] * A[3] mulxq 24(%rdi), %r12, %r13 # A[2] * A[1] movq 16(%rdi), %rdx - mulxq 8(%rdi), %rcx, %rbx - xorq %rbp, %rbp + mulxq %rax, %rcx, %rbx + xorq %r9, %r9 adoxq %rcx, %r12 # A[2] * A[3] mulxq 24(%rdi), %r14, %r15 adoxq %rbx, %r13 # A[2] * A[0] - mulxq (%rdi), %rcx, %rbx - adoxq %rbp, %r14 + mulxq %rbp, %rcx, %rbx + adoxq %r9, %r14 adcxq %rcx, %r11 - adoxq %rbp, %r15 + adoxq %r9, %r15 # A[1] * A[3] - movq 8(%rdi), %rdx - mulxq 24(%rdi), %rax, %r9 + movq %rax, %rdx + mulxq 24(%rdi), %rcx, %rdx adcxq %rbx, %r12 - adcxq %rax, %r13 - adcxq %r9, %r14 - adcxq %rbp, %r15 - # Double with Carry Flag - xorq %rbp, %rbp + adcxq %rcx, %r13 + adcxq %rdx, %r14 + adcxq %r9, %r15 # A[0] * A[0] - movq (%rdi), %rdx - mulxq %rdx, %r9, %rax + movq %rbp, %rdx + mulxq %rdx, %r9, %rcx + xorq %rbp, %rbp adcxq %r10, %r10 # A[1] * A[1] - movq 8(%rdi), %rdx + movq %rax, %rdx + adoxq %rcx, %r10 mulxq %rdx, %rcx, %rbx adcxq %r11, %r11 - adoxq %rax, %r10 - adcxq %r12, %r12 adoxq %rcx, %r11 + adcxq %r12, %r12 # A[2] * A[2] movq 16(%rdi), %rdx - mulxq %rdx, %rax, %rcx - adcxq %r13, %r13 adoxq %rbx, %r12 + mulxq %rdx, %rbx, %rcx + adcxq %r13, %r13 + adoxq %rbx, %r13 adcxq %r14, %r14 - adoxq %rax, %r13 # A[3] * A[3] movq 24(%rdi), %rdx - mulxq %rdx, %rax, %rbx - adcxq %r15, %r15 adoxq %rcx, %r14 + mulxq %rdx, %rcx, %rbx + adcxq %r15, %r15 + adoxq %rcx, %r15 adcxq %rbp, %rbp - adoxq %rax, %r15 adoxq %rbx, %rbp - # Reduce + movq $38, %rdx + mulxq %rbp, %rbp, %rbx + addq %rbp, %r12 + adcq $0x00, %rbx movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r15, %rbp - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 + shldq $0x01, %r12, %rbx + imulq $19, %rbx, %rbx andq %rcx, %r12 - # Multiply top half by 19 - movq $19, %rdx xorq %rcx, %rcx - mulxq %r13, %rax, %r13 - adcxq %rax, %r9 + adoxq %rbx, %r9 + mulxq %r13, %rbx, %r13 + adcxq %rbx, %r9 adoxq %r13, %r10 - mulxq %r14, %rax, 
%r14 - adcxq %rax, %r10 + mulxq %r14, %rbx, %r14 + adcxq %rbx, %r10 adoxq %r14, %r11 - mulxq %r15, %rax, %r15 - adcxq %rax, %r11 + mulxq %r15, %rbx, %r15 + adcxq %rbx, %r11 adoxq %r15, %r12 - mulxq %rbp, %rbp, %rdx - adcxq %rbp, %r12 - adoxq %rcx, %rdx - adcxq %rcx, %rdx - # Overflow - shldq $0x01, %r12, %rdx - movq $0x7fffffffffffffff, %rcx - imulq $19, %rdx, %rax - andq %rcx, %r12 - addq %rax, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Reduce if top bit set - movq %r12, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r12 - addq %rdx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 + adcxq %rcx, %r12 # Store movq %r9, 128(%rsp) movq %r10, 136(%rsp) movq %r11, 144(%rsp) movq %r12, 152(%rsp) + # Add-Sub # Add - movq 32(%rsp), %r9 - movq 40(%rsp), %r10 - movq 48(%rsp), %r11 - movq 56(%rsp), %rax + movq (%rsp), %r9 + movq 8(%rsp), %r10 + movq 16(%rsp), %r11 + movq 24(%rsp), %r12 movq %r9, %r13 - addq (%rsp), %r9 + addq 32(%rsp), %r9 movq %r10, %r14 - adcq 8(%rsp), %r10 + adcq 40(%rsp), %r10 movq %r11, %r15 - adcq 16(%rsp), %r11 - movq %rax, %rbp - adcq 24(%rsp), %rax - movq $-19, %rcx - movq %rax, %r12 + adcq 48(%rsp), %r11 + movq %r12, %rbp + adcq 56(%rsp), %r12 + movq $0x00, %rcx + adcq $0x00, %rcx + shldq $0x01, %r12, %rcx movq $0x7fffffffffffffff, %rbx - sarq $63, %rax - # Mask the modulus - andq %rax, %rcx - andq %rax, %rbx + imulq $19, %rcx + andq %rbx, %r12 # Sub modulus (if overflow) - subq %rcx, %r9 - sbbq %rax, %r10 - sbbq %rax, %r11 - sbbq %rbx, %r12 + addq %rcx, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 # Sub - subq (%rsp), %r13 - movq $0x00, %rax - sbbq 8(%rsp), %r14 - movq $-19, %rcx - sbbq 16(%rsp), %r15 - movq $0x7fffffffffffffff, %rbx - sbbq 24(%rsp), %rbp - sbbq $0x00, %rax - # Mask the modulus - andq %rax, %rcx - andq %rax, %rbx + subq 32(%rsp), %r13 + sbbq 40(%rsp), %r14 + sbbq 48(%rsp), %r15 + sbbq 56(%rsp), %rbp + sbbq %rcx, %rcx + shldq $0x01, %rbp, %rcx + imulq $-19, %rcx + andq %rbx, %rbp # Add modulus (if underflow) - addq %rcx, %r13 - adcq %rax, %r14 - adcq %rax, %r15 - adcq %rbx, %rbp + subq %rcx, %r13 + sbbq $0x00, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbp movq %r9, 64(%rsp) movq %r10, 72(%rsp) movq %r11, 80(%rsp) movq %r12, 88(%rsp) - movq %r13, (%rsp) - movq %r14, 8(%rsp) - movq %r15, 16(%rsp) - movq %rbp, 24(%rsp) + movq %r13, 32(%rsp) + movq %r14, 40(%rsp) + movq %r15, 48(%rsp) + movq %rbp, 56(%rsp) # Multiply # A[0] * B[0] movq 96(%rsp), %rdx @@ -11107,12 +10374,11 @@ L_curve25519_avx2_bits: mulxq 136(%rsp), %rcx, %rbx xorq %rbp, %rbp adcxq %rcx, %r10 - # A[1] * B[3] - movq 120(%rsp), %rdx - mulxq 136(%rsp), %r13, %r14 + # A[3] * B[1] + movq 104(%rsp), %rdx + mulxq 152(%rsp), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] - movq 104(%rsp), %rdx mulxq 128(%rsp), %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] @@ -11136,10 +10402,10 @@ L_curve25519_avx2_bits: mulxq 136(%rsp), %rdx, %rcx adcxq %rbx, %r12 adoxq %rdx, %r11 - # A[3] * B[1] - movq 104(%rsp), %rdx + # A[1] * B[3] + movq 120(%rsp), %rdx adoxq %rcx, %r12 - mulxq 152(%rsp), %rcx, %rbx + mulxq 136(%rsp), %rcx, %rbx adcxq %rcx, %r13 # A[2] * B[2] movq 112(%rsp), %rdx @@ -11158,34 +10424,33 @@ L_curve25519_avx2_bits: xorq %rbx, %rbx adcxq %rdx, %r12 # A[3] * B[0] - movq 96(%rsp), %rdx + movq 152(%rsp), %rdx adcxq %rcx, %r13 - mulxq 152(%rsp), %rdx, %rcx + mulxq 96(%rsp), %rdx, %rcx adoxq %rdx, %r12 adoxq %rcx, %r13 + # A[3] * B[2] + movq 152(%rsp), %rdx + mulxq 112(%rsp), %rdx, %rcx + adcxq %rdx, %r14 # A[2] * B[3] movq 120(%rsp), %rdx - mulxq 144(%rsp), 
%rdx, %rcx - adcxq %rdx, %r14 - # A[3] * B[2] - movq 112(%rsp), %rdx adcxq %rcx, %r15 - mulxq 152(%rsp), %rcx, %rdx + mulxq 144(%rsp), %rcx, %rdx adcxq %rbx, %rbp adoxq %rcx, %r14 adoxq %rdx, %r15 adoxq %rbx, %rbp - # Reduce + movq $38, %rdx + mulxq %rbp, %rbp, %rcx + addq %rbp, %r12 + adcq $0x00, %rcx movq $0x7fffffffffffffff, %rbx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r15, %rbp - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 + shldq $0x01, %r12, %rcx + imulq $19, %rcx, %rcx andq %rbx, %r12 - # Multiply top half by 19 - movq $19, %rdx xorq %rbx, %rbx + adoxq %rcx, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 @@ -11195,28 +10460,7 @@ L_curve25519_avx2_bits: mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 - mulxq %rbp, %rbp, %rdx - adcxq %rbp, %r12 - adoxq %rbx, %rdx - adcxq %rbx, %rdx - # Overflow - shldq $0x01, %r12, %rdx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rdx, %rcx - andq %rbx, %r12 - addq %rcx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Reduce if top bit set - movq %r12, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rbx, %r12 - addq %rdx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 + adcxq %rbx, %r12 # Store movq %r9, (%rdi) movq %r10, 8(%rdi) @@ -11228,126 +10472,103 @@ L_curve25519_avx2_bits: movq 144(%rsp), %r11 movq 152(%rsp), %r12 subq 96(%rsp), %r9 - movq $0x00, %rax sbbq 104(%rsp), %r10 - movq $-19, %rcx sbbq 112(%rsp), %r11 - movq $0x7fffffffffffffff, %rbx sbbq 120(%rsp), %r12 - sbbq $0x00, %rax - # Mask the modulus - andq %rax, %rcx - andq %rax, %rbx + sbbq %rcx, %rcx + shldq $0x01, %r12, %rcx + movq $0x7fffffffffffffff, %rbx + imulq $-19, %rcx + andq %rbx, %r12 # Add modulus (if underflow) - addq %rcx, %r9 - adcq %rax, %r10 - adcq %rax, %r11 - adcq %rbx, %r12 + subq %rcx, %r9 + sbbq $0x00, %r10 + sbbq $0x00, %r11 + sbbq $0x00, %r12 movq %r9, 128(%rsp) movq %r10, 136(%rsp) movq %r11, 144(%rsp) movq %r12, 152(%rsp) # Square + movq 32(%rsp), %rdx + movq 40(%rsp), %rax # A[0] * A[1] - movq (%rsp), %rdx - mulxq 8(%rsp), %r10, %r11 + movq %rdx, %rbp + mulxq %rax, %r10, %r11 # A[0] * A[3] - mulxq 24(%rsp), %r12, %r13 + mulxq 56(%rsp), %r12, %r13 # A[2] * A[1] - movq 16(%rsp), %rdx - mulxq 8(%rsp), %rcx, %rbx - xorq %rbp, %rbp + movq 48(%rsp), %rdx + mulxq %rax, %rcx, %rbx + xorq %r9, %r9 adoxq %rcx, %r12 # A[2] * A[3] - mulxq 24(%rsp), %r14, %r15 + mulxq 56(%rsp), %r14, %r15 adoxq %rbx, %r13 # A[2] * A[0] - mulxq (%rsp), %rcx, %rbx - adoxq %rbp, %r14 + mulxq %rbp, %rcx, %rbx + adoxq %r9, %r14 adcxq %rcx, %r11 - adoxq %rbp, %r15 + adoxq %r9, %r15 # A[1] * A[3] - movq 8(%rsp), %rdx - mulxq 24(%rsp), %rax, %r9 + movq %rax, %rdx + mulxq 56(%rsp), %rcx, %rdx adcxq %rbx, %r12 - adcxq %rax, %r13 - adcxq %r9, %r14 - adcxq %rbp, %r15 - # Double with Carry Flag - xorq %rbp, %rbp + adcxq %rcx, %r13 + adcxq %rdx, %r14 + adcxq %r9, %r15 # A[0] * A[0] - movq (%rsp), %rdx - mulxq %rdx, %r9, %rax + movq %rbp, %rdx + mulxq %rdx, %r9, %rcx + xorq %rbp, %rbp adcxq %r10, %r10 # A[1] * A[1] - movq 8(%rsp), %rdx + movq %rax, %rdx + adoxq %rcx, %r10 mulxq %rdx, %rcx, %rbx adcxq %r11, %r11 - adoxq %rax, %r10 - adcxq %r12, %r12 adoxq %rcx, %r11 + adcxq %r12, %r12 # A[2] * A[2] - movq 16(%rsp), %rdx - mulxq %rdx, %rax, %rcx - adcxq %r13, %r13 + movq 48(%rsp), %rdx adoxq %rbx, %r12 + mulxq %rdx, %rbx, %rcx + adcxq %r13, %r13 + adoxq %rbx, %r13 adcxq %r14, %r14 - adoxq %rax, %r13 # A[3] * A[3] - movq 24(%rsp), %rdx - mulxq %rdx, %rax, %rbx - adcxq %r15, %r15 + movq 
56(%rsp), %rdx adoxq %rcx, %r14 + mulxq %rdx, %rcx, %rbx + adcxq %r15, %r15 + adoxq %rcx, %r15 adcxq %rbp, %rbp - adoxq %rax, %r15 adoxq %rbx, %rbp - # Reduce + movq $38, %rdx + mulxq %rbp, %rbp, %rbx + addq %rbp, %r12 + adcq $0x00, %rbx movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r15, %rbp - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 + shldq $0x01, %r12, %rbx + imulq $19, %rbx, %rbx andq %rcx, %r12 - # Multiply top half by 19 - movq $19, %rdx xorq %rcx, %rcx - mulxq %r13, %rax, %r13 - adcxq %rax, %r9 + adoxq %rbx, %r9 + mulxq %r13, %rbx, %r13 + adcxq %rbx, %r9 adoxq %r13, %r10 - mulxq %r14, %rax, %r14 - adcxq %rax, %r10 + mulxq %r14, %rbx, %r14 + adcxq %rbx, %r10 adoxq %r14, %r11 - mulxq %r15, %rax, %r15 - adcxq %rax, %r11 + mulxq %r15, %rbx, %r15 + adcxq %rbx, %r11 adoxq %r15, %r12 - mulxq %rbp, %rbp, %rdx - adcxq %rbp, %r12 - adoxq %rcx, %rdx - adcxq %rcx, %rdx - # Overflow - shldq $0x01, %r12, %rdx - movq $0x7fffffffffffffff, %rcx - imulq $19, %rdx, %rax - andq %rcx, %r12 - addq %rax, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Reduce if top bit set - movq %r12, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r12 - addq %rdx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 + adcxq %rcx, %r12 # Store - movq %r9, (%rsp) - movq %r10, 8(%rsp) - movq %r11, 16(%rsp) - movq %r12, 24(%rsp) + movq %r9, 32(%rsp) + movq %r10, 40(%rsp) + movq %r11, 48(%rsp) + movq %r12, 56(%rsp) movq $0x1db42, %rdx mulxq 128(%rsp), %r9, %rbp mulxq 136(%rsp), %r10, %r15 @@ -11365,106 +10586,85 @@ L_curve25519_avx2_bits: adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 - movq %r9, 32(%rsp) - movq %r10, 40(%rsp) - movq %r11, 48(%rsp) - movq %r12, 56(%rsp) + movq %r9, (%rsp) + movq %r10, 8(%rsp) + movq %r11, 16(%rsp) + movq %r12, 24(%rsp) # Square - # A[0] * A[1] movq 64(%rsp), %rdx - mulxq 72(%rsp), %r10, %r11 + movq 72(%rsp), %rax + # A[0] * A[1] + movq %rdx, %rbp + mulxq %rax, %r10, %r11 # A[0] * A[3] mulxq 88(%rsp), %r12, %r13 # A[2] * A[1] movq 80(%rsp), %rdx - mulxq 72(%rsp), %rcx, %rbx - xorq %rbp, %rbp + mulxq %rax, %rcx, %rbx + xorq %r9, %r9 adoxq %rcx, %r12 # A[2] * A[3] mulxq 88(%rsp), %r14, %r15 adoxq %rbx, %r13 # A[2] * A[0] - mulxq 64(%rsp), %rcx, %rbx - adoxq %rbp, %r14 + mulxq %rbp, %rcx, %rbx + adoxq %r9, %r14 adcxq %rcx, %r11 - adoxq %rbp, %r15 + adoxq %r9, %r15 # A[1] * A[3] - movq 72(%rsp), %rdx - mulxq 88(%rsp), %rax, %r9 + movq %rax, %rdx + mulxq 88(%rsp), %rcx, %rdx adcxq %rbx, %r12 - adcxq %rax, %r13 - adcxq %r9, %r14 - adcxq %rbp, %r15 - # Double with Carry Flag - xorq %rbp, %rbp + adcxq %rcx, %r13 + adcxq %rdx, %r14 + adcxq %r9, %r15 # A[0] * A[0] - movq 64(%rsp), %rdx - mulxq %rdx, %r9, %rax + movq %rbp, %rdx + mulxq %rdx, %r9, %rcx + xorq %rbp, %rbp adcxq %r10, %r10 # A[1] * A[1] - movq 72(%rsp), %rdx + movq %rax, %rdx + adoxq %rcx, %r10 mulxq %rdx, %rcx, %rbx adcxq %r11, %r11 - adoxq %rax, %r10 - adcxq %r12, %r12 adoxq %rcx, %r11 + adcxq %r12, %r12 # A[2] * A[2] movq 80(%rsp), %rdx - mulxq %rdx, %rax, %rcx - adcxq %r13, %r13 adoxq %rbx, %r12 + mulxq %rdx, %rbx, %rcx + adcxq %r13, %r13 + adoxq %rbx, %r13 adcxq %r14, %r14 - adoxq %rax, %r13 # A[3] * A[3] movq 88(%rsp), %rdx - mulxq %rdx, %rax, %rbx - adcxq %r15, %r15 adoxq %rcx, %r14 + mulxq %rdx, %rcx, %rbx + adcxq %r15, %r15 + adoxq %rcx, %r15 adcxq %rbp, %rbp - adoxq %rax, %r15 adoxq %rbx, %rbp - # Reduce + movq $38, %rdx + mulxq %rbp, %rbp, %rbx + addq %rbp, %r12 + adcq $0x00, %rbx movq $0x7fffffffffffffff, 
%rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r15, %rbp - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 + shldq $0x01, %r12, %rbx + imulq $19, %rbx, %rbx andq %rcx, %r12 - # Multiply top half by 19 - movq $19, %rdx xorq %rcx, %rcx - mulxq %r13, %rax, %r13 - adcxq %rax, %r9 + adoxq %rbx, %r9 + mulxq %r13, %rbx, %r13 + adcxq %rbx, %r9 adoxq %r13, %r10 - mulxq %r14, %rax, %r14 - adcxq %rax, %r10 + mulxq %r14, %rbx, %r14 + adcxq %rbx, %r10 adoxq %r14, %r11 - mulxq %r15, %rax, %r15 - adcxq %rax, %r11 + mulxq %r15, %rbx, %r15 + adcxq %rbx, %r11 adoxq %r15, %r12 - mulxq %rbp, %rbp, %rdx - adcxq %rbp, %r12 - adoxq %rcx, %rdx - adcxq %rcx, %rdx - # Overflow - shldq $0x01, %r12, %rdx - movq $0x7fffffffffffffff, %rcx - imulq $19, %rdx, %rax - andq %rcx, %r12 - addq %rax, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Reduce if top bit set - movq %r12, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r12 - addq %rdx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 + adcxq %rcx, %r12 # Store movq %r9, 64(%rsp) movq %r10, 72(%rsp) @@ -11473,31 +10673,30 @@ L_curve25519_avx2_bits: # Add movq 96(%rsp), %r9 movq 104(%rsp), %r10 - addq 32(%rsp), %r9 + addq (%rsp), %r9 movq 112(%rsp), %r11 - adcq 40(%rsp), %r10 - movq 120(%rsp), %rax - adcq 48(%rsp), %r11 - movq $-19, %rcx - adcq 56(%rsp), %rax + adcq 8(%rsp), %r10 + movq 120(%rsp), %r12 + adcq 16(%rsp), %r11 + adcq 24(%rsp), %r12 + movq $0x00, %rcx + adcq $0x00, %rcx + shldq $0x01, %r12, %rcx movq $0x7fffffffffffffff, %rbx - movq %rax, %r12 - sarq $63, %rax - # Mask the modulus - andq %rax, %rcx - andq %rax, %rbx + imulq $19, %rcx + andq %rbx, %r12 # Sub modulus (if overflow) - subq %rcx, %r9 - sbbq %rax, %r10 - sbbq %rax, %r11 - sbbq %rbx, %r12 + addq %rcx, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 movq %r9, 96(%rsp) movq %r10, 104(%rsp) movq %r11, 112(%rsp) movq %r12, 120(%rsp) # Multiply # A[0] * B[0] - movq (%rsp), %rdx + movq 32(%rsp), %rdx mulxq (%r8), %r9, %r10 # A[2] * B[0] mulxq 16(%r8), %r11, %r12 @@ -11505,12 +10704,11 @@ L_curve25519_avx2_bits: mulxq 8(%r8), %rcx, %rbx xorq %rbp, %rbp adcxq %rcx, %r10 - # A[1] * B[3] - movq 24(%rsp), %rdx - mulxq 8(%r8), %r13, %r14 + # A[3] * B[1] + movq 40(%rsp), %rdx + mulxq 24(%r8), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] - movq 8(%rsp), %rdx mulxq (%r8), %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] @@ -11518,7 +10716,7 @@ L_curve25519_avx2_bits: adoxq %rbx, %r11 adcxq %rcx, %r12 # A[1] * B[2] - movq 16(%rsp), %rdx + movq 48(%rsp), %rdx mulxq 8(%r8), %rcx, %rbx adcxq %r15, %r13 adoxq %rcx, %r12 @@ -11530,22 +10728,22 @@ L_curve25519_avx2_bits: xorq %r15, %r15 adcxq %rcx, %r11 # A[1] * B[1] - movq 8(%rsp), %rdx + movq 40(%rsp), %rdx mulxq 8(%r8), %rdx, %rcx adcxq %rbx, %r12 adoxq %rdx, %r11 - # A[3] * B[1] - movq 8(%rsp), %rdx + # A[1] * B[3] + movq 56(%rsp), %rdx adoxq %rcx, %r12 - mulxq 24(%r8), %rcx, %rbx + mulxq 8(%r8), %rcx, %rbx adcxq %rcx, %r13 # A[2] * B[2] - movq 16(%rsp), %rdx + movq 48(%rsp), %rdx mulxq 16(%r8), %rdx, %rcx adcxq %rbx, %r14 adoxq %rdx, %r13 # A[3] * B[3] - movq 24(%rsp), %rdx + movq 56(%rsp), %rdx adoxq %rcx, %r14 mulxq 24(%r8), %rcx, %rbx adoxq %rbp, %r15 @@ -11556,34 +10754,33 @@ L_curve25519_avx2_bits: xorq %rbx, %rbx adcxq %rdx, %r12 # A[3] * B[0] - movq (%rsp), %rdx + movq 24(%r8), %rdx adcxq %rcx, %r13 - mulxq 24(%r8), %rdx, %rcx + mulxq 32(%rsp), %rdx, %rcx adoxq %rdx, %r12 adoxq %rcx, %r13 - # A[2] * B[3] - movq 24(%rsp), %rdx - mulxq 16(%r8), %rdx, %rcx - adcxq 
%rdx, %r14 # A[3] * B[2] - movq 16(%rsp), %rdx + movq 24(%r8), %rdx + mulxq 48(%rsp), %rdx, %rcx + adcxq %rdx, %r14 + # A[2] * B[3] + movq 56(%rsp), %rdx adcxq %rcx, %r15 - mulxq 24(%r8), %rcx, %rdx + mulxq 16(%r8), %rcx, %rdx adcxq %rbx, %rbp adoxq %rcx, %r14 adoxq %rdx, %r15 adoxq %rbx, %rbp - # Reduce + movq $38, %rdx + mulxq %rbp, %rbp, %rcx + addq %rbp, %r12 + adcq $0x00, %rcx movq $0x7fffffffffffffff, %rbx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r15, %rbp - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 + shldq $0x01, %r12, %rcx + imulq $19, %rcx, %rcx andq %rbx, %r12 - # Multiply top half by 19 - movq $19, %rdx xorq %rbx, %rbx + adoxq %rcx, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 @@ -11593,28 +10790,7 @@ L_curve25519_avx2_bits: mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 - mulxq %rbp, %rbp, %rdx - adcxq %rbp, %r12 - adoxq %rbx, %rdx - adcxq %rbx, %rdx - # Overflow - shldq $0x01, %r12, %rdx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rdx, %rcx - andq %rbx, %r12 - addq %rcx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Reduce if top bit set - movq %r12, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rbx, %r12 - addq %rdx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 + adcxq %rbx, %r12 # Store movq %r9, 32(%rsp) movq %r10, 40(%rsp) @@ -11630,12 +10806,11 @@ L_curve25519_avx2_bits: mulxq 136(%rsp), %rcx, %rbx xorq %rbp, %rbp adcxq %rcx, %r10 - # A[1] * B[3] - movq 120(%rsp), %rdx - mulxq 136(%rsp), %r13, %r14 + # A[3] * B[1] + movq 104(%rsp), %rdx + mulxq 152(%rsp), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] - movq 104(%rsp), %rdx mulxq 128(%rsp), %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] @@ -11659,10 +10834,10 @@ L_curve25519_avx2_bits: mulxq 136(%rsp), %rdx, %rcx adcxq %rbx, %r12 adoxq %rdx, %r11 - # A[3] * B[1] - movq 104(%rsp), %rdx + # A[1] * B[3] + movq 120(%rsp), %rdx adoxq %rcx, %r12 - mulxq 152(%rsp), %rcx, %rbx + mulxq 136(%rsp), %rcx, %rbx adcxq %rcx, %r13 # A[2] * B[2] movq 112(%rsp), %rdx @@ -11681,34 +10856,33 @@ L_curve25519_avx2_bits: xorq %rbx, %rbx adcxq %rdx, %r12 # A[3] * B[0] - movq 96(%rsp), %rdx + movq 152(%rsp), %rdx adcxq %rcx, %r13 - mulxq 152(%rsp), %rdx, %rcx + mulxq 96(%rsp), %rdx, %rcx adoxq %rdx, %r12 adoxq %rcx, %r13 + # A[3] * B[2] + movq 152(%rsp), %rdx + mulxq 112(%rsp), %rdx, %rcx + adcxq %rdx, %r14 # A[2] * B[3] movq 120(%rsp), %rdx - mulxq 144(%rsp), %rdx, %rcx - adcxq %rdx, %r14 - # A[3] * B[2] - movq 112(%rsp), %rdx adcxq %rcx, %r15 - mulxq 152(%rsp), %rcx, %rdx + mulxq 144(%rsp), %rcx, %rdx adcxq %rbx, %rbp adoxq %rcx, %r14 adoxq %rdx, %r15 adoxq %rbx, %rbp - # Reduce + movq $38, %rdx + mulxq %rbp, %rbp, %rcx + addq %rbp, %r12 + adcq $0x00, %rcx movq $0x7fffffffffffffff, %rbx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r15, %rbp - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 + shldq $0x01, %r12, %rcx + imulq $19, %rcx, %rcx andq %rbx, %r12 - # Multiply top half by 19 - movq $19, %rdx xorq %rbx, %rbx + adoxq %rcx, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 @@ -11718,38 +10892,15 @@ L_curve25519_avx2_bits: mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 - mulxq %rbp, %rbp, %rdx - adcxq %rbp, %r12 - adoxq %rbx, %rdx - adcxq %rbx, %rdx - # Overflow - shldq $0x01, %r12, %rdx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rdx, %rcx - andq %rbx, %r12 - addq %rcx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Reduce if top 
bit set - movq %r12, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rbx, %r12 - addq %rdx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 + adcxq %rbx, %r12 # Store movq %r9, (%rsp) movq %r10, 8(%rsp) movq %r11, 16(%rsp) movq %r12, 24(%rsp) - decb 168(%rsp) + movq 160(%rsp), %rbx + decq %rbx jge L_curve25519_avx2_bits - movq $63, 168(%rsp) - decb 160(%rsp) - jge L_curve25519_avx2_words # Invert leaq 32(%rsp), %rdi movq %rsp, %rsi @@ -11987,7 +11138,7 @@ L_curve25519_avx2_bits: #else callq _fe_mul_avx2 #endif /* __APPLE__ */ - movq 176(%rsp), %rdi + movq 168(%rsp), %rdi # Multiply # A[0] * B[0] movq (%rsp), %rdx @@ -11998,12 +11149,11 @@ L_curve25519_avx2_bits: mulxq 8(%rdi), %rcx, %rbx xorq %rbp, %rbp adcxq %rcx, %r10 - # A[1] * B[3] - movq 24(%rsp), %rdx - mulxq 8(%rdi), %r13, %r14 + # A[3] * B[1] + movq 8(%rsp), %rdx + mulxq 24(%rdi), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] - movq 8(%rsp), %rdx mulxq (%rdi), %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] @@ -12027,10 +11177,10 @@ L_curve25519_avx2_bits: mulxq 8(%rdi), %rdx, %rcx adcxq %rbx, %r12 adoxq %rdx, %r11 - # A[3] * B[1] - movq 8(%rsp), %rdx + # A[1] * B[3] + movq 24(%rsp), %rdx adoxq %rcx, %r12 - mulxq 24(%rdi), %rcx, %rbx + mulxq 8(%rdi), %rcx, %rbx adcxq %rcx, %r13 # A[2] * B[2] movq 16(%rsp), %rdx @@ -12049,34 +11199,33 @@ L_curve25519_avx2_bits: xorq %rbx, %rbx adcxq %rdx, %r12 # A[3] * B[0] - movq (%rsp), %rdx + movq 24(%rdi), %rdx adcxq %rcx, %r13 - mulxq 24(%rdi), %rdx, %rcx + mulxq (%rsp), %rdx, %rcx adoxq %rdx, %r12 adoxq %rcx, %r13 + # A[3] * B[2] + movq 24(%rdi), %rdx + mulxq 16(%rsp), %rdx, %rcx + adcxq %rdx, %r14 # A[2] * B[3] movq 24(%rsp), %rdx - mulxq 16(%rdi), %rdx, %rcx - adcxq %rdx, %r14 - # A[3] * B[2] - movq 16(%rsp), %rdx adcxq %rcx, %r15 - mulxq 24(%rdi), %rcx, %rdx + mulxq 16(%rdi), %rcx, %rdx adcxq %rbx, %rbp adoxq %rcx, %r14 adoxq %rdx, %r15 adoxq %rbx, %rbp - # Reduce + movq $38, %rdx + mulxq %rbp, %rbp, %rcx + addq %rbp, %r12 + adcq $0x00, %rcx movq $0x7fffffffffffffff, %rbx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r15, %rbp - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 + shldq $0x01, %r12, %rcx + imulq $19, %rcx, %rcx andq %rbx, %r12 - # Multiply top half by 19 - movq $19, %rdx xorq %rbx, %rbx + adoxq %rcx, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 @@ -12086,20 +11235,8 @@ L_curve25519_avx2_bits: mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 - mulxq %rbp, %rbp, %rdx - adcxq %rbp, %r12 - adoxq %rbx, %rdx - adcxq %rbx, %rdx - # Overflow - shldq $0x01, %r12, %rdx + adcxq %rbx, %r12 movq $0x7fffffffffffffff, %rbx - imulq $19, %rdx, %rcx - andq %rbx, %r12 - addq %rcx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - # Reduce if top bit set movq %r12, %rdx sarq $63, %rdx andq $19, %rdx @@ -12108,7 +11245,7 @@ L_curve25519_avx2_bits: adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 - movq $0x7fffffffffffffff, %rbx + movq $0x7fffffffffffffff, %rcx movq %r9, %rdx addq $19, %rdx movq %r10, %rdx @@ -12119,18 +11256,18 @@ L_curve25519_avx2_bits: adcq $0x00, %rdx sarq $63, %rdx andq $19, %rdx + andq %rcx, %r12 addq %rdx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 - andq %rbx, %r12 # Store movq %r9, (%rdi) movq %r10, 8(%rdi) movq %r11, 16(%rdi) movq %r12, 24(%rdi) xorq %rax, %rax - addq $0xc0, %rsp + addq $0xb8, %rsp popq %rbp popq %r15 popq %r14 @@ -12141,6 +11278,126 @@ L_curve25519_avx2_bits: #ifndef __APPLE__ .size curve25519_avx2,.-curve25519_avx2 #endif /* __APPLE__ */ +#ifdef 
HAVE_ED25519 +#ifndef __APPLE__ +.text +.globl fe_sq2_avx2 +.type fe_sq2_avx2,@function +.align 16 +fe_sq2_avx2: +#else +.section __TEXT,__text +.globl _fe_sq2_avx2 +.p2align 4 +_fe_sq2_avx2: +#endif /* __APPLE__ */ + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + # Square * 2 + movq (%rsi), %rdx + movq 8(%rsi), %rax + # A[0] * A[1] + movq %rdx, %r15 + mulxq %rax, %r9, %r10 + # A[0] * A[3] + mulxq 24(%rsi), %r11, %r12 + # A[2] * A[1] + movq 16(%rsi), %rdx + mulxq %rax, %rcx, %rbx + xorq %r8, %r8 + adoxq %rcx, %r11 + # A[2] * A[3] + mulxq 24(%rsi), %r13, %r14 + adoxq %rbx, %r12 + # A[2] * A[0] + mulxq %r15, %rcx, %rbx + adoxq %r8, %r13 + adcxq %rcx, %r10 + adoxq %r8, %r14 + # A[1] * A[3] + movq %rax, %rdx + mulxq 24(%rsi), %rcx, %rdx + adcxq %rbx, %r11 + adcxq %rcx, %r12 + adcxq %rdx, %r13 + adcxq %r8, %r14 + # A[0] * A[0] + movq %r15, %rdx + mulxq %rdx, %r8, %rcx + xorq %r15, %r15 + adcxq %r9, %r9 + # A[1] * A[1] + movq %rax, %rdx + adoxq %rcx, %r9 + mulxq %rdx, %rcx, %rbx + adcxq %r10, %r10 + adoxq %rcx, %r10 + adcxq %r11, %r11 + # A[2] * A[2] + movq 16(%rsi), %rdx + adoxq %rbx, %r11 + mulxq %rdx, %rbx, %rcx + adcxq %r12, %r12 + adoxq %rbx, %r12 + adcxq %r13, %r13 + # A[3] * A[3] + movq 24(%rsi), %rdx + adoxq %rcx, %r13 + mulxq %rdx, %rcx, %rbx + adcxq %r14, %r14 + adoxq %rcx, %r14 + adcxq %r15, %r15 + adoxq %rbx, %r15 + movq $38, %rdx + mulxq %r15, %r15, %rax + addq %r15, %r11 + adcq $0x00, %rax + movq $0x7fffffffffffffff, %rcx + shldq $0x01, %r11, %rax + imulq $19, %rax, %rax + andq %rcx, %r11 + xorq %rcx, %rcx + adoxq %rax, %r8 + mulxq %r12, %rax, %r12 + adcxq %rax, %r8 + adoxq %r12, %r9 + mulxq %r13, %rax, %r13 + adcxq %rax, %r9 + adoxq %r13, %r10 + mulxq %r14, %rax, %r14 + adcxq %rax, %r10 + adoxq %r14, %r11 + adcxq %rcx, %r11 + mov %r11, %rax + shldq $0x01, %r10, %r11 + shldq $0x01, %r9, %r10 + shldq $0x01, %r8, %r9 + shlq $0x01, %r8 + movq $0x7fffffffffffffff, %rcx + shrq $62, %rax + andq %rcx, %r11 + imulq $19, %rax, %rax + addq %rax, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + # Store + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + repz retq +#ifndef __APPLE__ +.size fe_sq2_avx2,.-fe_sq2_avx2 +#endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_pow22523_avx2 @@ -12218,7 +11475,7 @@ _fe_pow22523_avx2: #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi - movb $4, %dl + movq $4, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else @@ -12241,7 +11498,7 @@ _fe_pow22523_avx2: #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi - movb $9, %dl + movq $9, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else @@ -12264,7 +11521,7 @@ _fe_pow22523_avx2: #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi - movb $19, %dl + movq $19, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else @@ -12287,7 +11544,7 @@ _fe_pow22523_avx2: #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi - movb $9, %dl + movq $9, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else @@ -12310,7 +11567,7 @@ _fe_pow22523_avx2: #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi - movb $49, %dl + movq $49, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else @@ -12333,7 +11590,7 @@ _fe_pow22523_avx2: #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi - movb $0x63, %dl + movq $0x63, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else @@ -12356,7 +11613,7 @@ _fe_pow22523_avx2: #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 
32(%rsp), %rsi - movb $49, %dl + movq $49, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else @@ -12398,4197 +11655,3916 @@ _fe_pow22523_avx2: repz retq #ifndef __APPLE__ .text -.globl fe_ge_to_p2_avx2 -.type fe_ge_to_p2_avx2,@function +.globl ge_p1p1_to_p2_avx2 +.type ge_p1p1_to_p2_avx2,@function .align 16 -fe_ge_to_p2_avx2: +ge_p1p1_to_p2_avx2: #else .section __TEXT,__text -.globl _fe_ge_to_p2_avx2 +.globl _ge_p1p1_to_p2_avx2 .p2align 4 -_fe_ge_to_p2_avx2: +_ge_p1p1_to_p2_avx2: #endif /* __APPLE__ */ - pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 - subq $40, %rsp - movq %rsi, (%rsp) - movq %rdx, 8(%rsp) - movq %rcx, 16(%rsp) - movq %r8, 24(%rsp) - movq %r9, 32(%rsp) - movq 16(%rsp), %rsi - movq 88(%rsp), %rbx + pushq %rbx + subq $16, %rsp + movq %rdi, (%rsp) + movq %rsi, 8(%rsp) + leaq 96(%rsi), %rax # Multiply # A[0] * B[0] - movq (%rbx), %rdx - mulxq (%rsi), %r8, %r9 + movq (%rax), %rdx + mulxq (%rsi), %r9, %r10 # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 + mulxq 16(%rsi), %r11, %r12 # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbx), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 + mulxq 8(%rsi), %rcx, %r8 + xorq %rbx, %rbx + adcxq %rcx, %r10 + # A[3] * B[1] + movq 8(%rax), %rdx + mulxq 24(%rsi), %r13, %r14 + adcxq %r8, %r11 # A[0] * B[1] - movq 8(%rbx), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 + mulxq (%rsi), %rcx, %r8 + adoxq %rcx, %r10 # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 + mulxq 16(%rsi), %rcx, %r15 + adoxq %r8, %r11 + adcxq %rcx, %r12 # A[1] * B[2] - movq 16(%rbx), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 + movq 16(%rax), %rdx + mulxq 8(%rsi), %rcx, %r8 adcxq %r15, %r13 - adoxq %rax, %r12 + adoxq %rcx, %r12 + adcxq %rbx, %r14 + adoxq %r8, %r13 # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 + mulxq (%rsi), %rcx, %r8 + adoxq %rbx, %r14 + xorq %r15, %r15 + adcxq %rcx, %r11 # A[1] * B[1] - movq 8(%rbx), %rdx + movq 8(%rax), %rdx mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbx), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 + adcxq %r8, %r12 + adoxq %rdx, %r11 + # A[1] * B[3] + movq 24(%rax), %rdx + adoxq %rcx, %r12 + mulxq 8(%rsi), %rcx, %r8 + adcxq %rcx, %r13 # A[2] * B[2] - movq 16(%rbx), %rdx + movq 16(%rax), %rdx mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 + adcxq %r8, %r14 + adoxq %rdx, %r13 # A[3] * B[3] - movq 24(%rbx), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 + movq 24(%rax), %rdx + adoxq %rcx, %r14 + mulxq 24(%rsi), %rcx, %r8 + adoxq %rbx, %r15 + adcxq %rcx, %r15 # A[0] * B[3] mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 + adcxq %r8, %rbx + xorq %r8, %r8 + adcxq %rdx, %r12 # A[3] * B[0] - movq (%rbx), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbx), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 + movq 24(%rsi), %rdx + adcxq %rcx, %r13 + mulxq (%rax), %rdx, %rcx + adoxq %rdx, %r12 adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - 
shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 + # A[3] * B[2] + movq 24(%rsi), %rdx + mulxq 16(%rax), %rdx, %rcx + adcxq %rdx, %r14 + # A[2] * B[3] + movq 24(%rax), %rdx + adcxq %rcx, %r15 + mulxq 16(%rsi), %rcx, %rdx + adcxq %r8, %rbx + adoxq %rcx, %r14 + adoxq %rdx, %r15 + adoxq %r8, %rbx + movq $38, %rdx + mulxq %rbx, %rbx, %rcx + addq %rbx, %r12 + adcq $0x00, %rcx + movq $0x7fffffffffffffff, %r8 + shldq $0x01, %r12, %rcx + imulq $19, %rcx, %rcx + andq %r8, %r12 + xorq %r8, %r8 + adoxq %rcx, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rax, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + mulxq %r15, %rcx, %r15 + adcxq %rcx, %r11 + adoxq %r15, %r12 + adcxq %r8, %r12 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq (%rsp), %rdi - movq 24(%rsp), %rsi - movq 32(%rsp), %rbx + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + leaq 64(%rsi), %rsi + leaq 64(%rdi), %rdi # Multiply # A[0] * B[0] - movq (%rbx), %rdx - mulxq (%rsi), %r8, %r9 + movq (%rax), %rdx + mulxq (%rsi), %r9, %r10 # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 + mulxq 16(%rsi), %r11, %r12 # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbx), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 + mulxq 8(%rsi), %rcx, %r8 + xorq %rbx, %rbx + adcxq %rcx, %r10 + # A[3] * B[1] + movq 8(%rax), %rdx + mulxq 24(%rsi), %r13, %r14 + adcxq %r8, %r11 # A[0] * B[1] - movq 8(%rbx), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 + mulxq (%rsi), %rcx, %r8 + adoxq %rcx, %r10 # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 + mulxq 16(%rsi), %rcx, %r15 + adoxq %r8, %r11 + adcxq %rcx, %r12 # A[1] * B[2] - movq 16(%rbx), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 + movq 16(%rax), %rdx + mulxq 8(%rsi), %rcx, %r8 adcxq %r15, %r13 - adoxq %rax, %r12 + adoxq %rcx, %r12 + adcxq %rbx, %r14 + adoxq %r8, %r13 # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 + mulxq (%rsi), %rcx, %r8 + adoxq %rbx, %r14 + xorq %r15, %r15 + adcxq %rcx, %r11 # A[1] * B[1] - movq 8(%rbx), %rdx + movq 8(%rax), %rdx mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbx), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 + adcxq %r8, %r12 + adoxq %rdx, %r11 + # A[1] * B[3] + movq 24(%rax), %rdx + adoxq %rcx, %r12 + mulxq 8(%rsi), %rcx, %r8 + adcxq %rcx, %r13 # A[2] * B[2] - movq 16(%rbx), %rdx + movq 16(%rax), %rdx mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 + adcxq %r8, %r14 + adoxq %rdx, %r13 # A[3] * B[3] - movq 24(%rbx), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 + movq 24(%rax), %rdx + adoxq %rcx, %r14 + mulxq 24(%rsi), %rcx, %r8 + adoxq %rbx, %r15 + adcxq %rcx, %r15 # A[0] * B[3] mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq 
%rax, %rax - adcxq %rdx, %r11 + adcxq %r8, %rbx + xorq %r8, %r8 + adcxq %rdx, %r12 # A[3] * B[0] - movq (%rbx), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbx), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 + movq 24(%rsi), %rdx + adcxq %rcx, %r13 + mulxq (%rax), %rdx, %rcx + adoxq %rdx, %r12 adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 + # A[3] * B[2] + movq 24(%rsi), %rdx + mulxq 16(%rax), %rdx, %rcx + adcxq %rdx, %r14 + # A[2] * B[3] + movq 24(%rax), %rdx + adcxq %rcx, %r15 + mulxq 16(%rsi), %rcx, %rdx + adcxq %r8, %rbx + adoxq %rcx, %r14 + adoxq %rdx, %r15 + adoxq %r8, %rbx + movq $38, %rdx + mulxq %rbx, %rbx, %rcx + addq %rbx, %r12 + adcq $0x00, %rcx + movq $0x7fffffffffffffff, %r8 + shldq $0x01, %r12, %rcx + imulq $19, %rcx, %rcx + andq %r8, %r12 + xorq %r8, %r8 + adoxq %rcx, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rax, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + mulxq %r15, %rcx, %r15 + adcxq %rcx, %r11 + adoxq %r15, %r12 + adcxq %r8, %r12 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 88(%rsp), %rsi + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + leaq -32(%rsi), %rax + leaq -32(%rdi), %rdi # Multiply # A[0] * B[0] - movq (%rsi), %rdx - mulxq (%rbx), %r8, %r9 + movq (%rax), %rdx + mulxq (%rsi), %r9, %r10 # A[2] * B[0] - mulxq 16(%rbx), %r10, %r11 + mulxq 16(%rsi), %r11, %r12 # A[1] * B[0] - mulxq 8(%rbx), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rsi), %rdx - mulxq 8(%rbx), %r12, %r13 - adcxq %rax, %r10 + mulxq 8(%rsi), %rcx, %r8 + xorq %rbx, %rbx + adcxq %rcx, %r10 + # A[3] * B[1] + movq 8(%rax), %rdx + mulxq 24(%rsi), %r13, %r14 + adcxq %r8, %r11 # A[0] * B[1] - movq 8(%rsi), %rdx - mulxq (%rbx), %rcx, %rax - adoxq %rcx, %r9 + mulxq (%rsi), %rcx, %r8 + adoxq %rcx, %r10 # A[2] * B[1] - mulxq 16(%rbx), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 + mulxq 16(%rsi), %rcx, %r15 + adoxq %r8, %r11 + adcxq %rcx, %r12 # A[1] * B[2] - movq 16(%rsi), %rdx - mulxq 8(%rbx), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 + movq 16(%rax), %rdx + mulxq 8(%rsi), %rcx, %r8 adcxq %r15, %r13 - adoxq %rax, %r12 + adoxq %rcx, %r12 + adcxq %rbx, %r14 + adoxq %r8, %r13 # A[0] * B[2] - mulxq (%rbx), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 + mulxq (%rsi), %rcx, %r8 + adoxq %rbx, %r14 + xorq %r15, %r15 + adcxq %rcx, %r11 # A[1] * B[1] - movq 8(%rsi), %rdx - mulxq 8(%rbx), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 
8(%rsi), %rdx - adoxq %rcx, %r11 - mulxq 24(%rbx), %rcx, %rax - adcxq %rcx, %r12 + movq 8(%rax), %rdx + mulxq 8(%rsi), %rdx, %rcx + adcxq %r8, %r12 + adoxq %rdx, %r11 + # A[1] * B[3] + movq 24(%rax), %rdx + adoxq %rcx, %r12 + mulxq 8(%rsi), %rcx, %r8 + adcxq %rcx, %r13 # A[2] * B[2] - movq 16(%rsi), %rdx - mulxq 16(%rbx), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 + movq 16(%rax), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %r8, %r14 + adoxq %rdx, %r13 # A[3] * B[3] - movq 24(%rsi), %rdx - adoxq %rcx, %r13 - mulxq 24(%rbx), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 + movq 24(%rax), %rdx + adoxq %rcx, %r14 + mulxq 24(%rsi), %rcx, %r8 + adoxq %rbx, %r15 + adcxq %rcx, %r15 # A[0] * B[3] - mulxq (%rbx), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 + mulxq (%rsi), %rdx, %rcx + adcxq %r8, %rbx + xorq %r8, %r8 + adcxq %rdx, %r12 # A[3] * B[0] - movq (%rsi), %rdx - adcxq %rcx, %r12 - mulxq 24(%rbx), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] movq 24(%rsi), %rdx - mulxq 16(%rbx), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rsi), %rdx - adcxq %rcx, %r14 - mulxq 24(%rbx), %rcx, %rdx - adcxq %rax, %r15 + adcxq %rcx, %r13 + mulxq (%rax), %rdx, %rcx + adoxq %rdx, %r12 adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 + # A[3] * B[2] + movq 24(%rsi), %rdx + mulxq 16(%rax), %rdx, %rcx + adcxq %rdx, %r14 + # A[2] * B[3] + movq 24(%rax), %rdx + adcxq %rcx, %r15 + mulxq 16(%rsi), %rcx, %rdx + adcxq %r8, %rbx + adoxq %rcx, %r14 + adoxq %rdx, %r15 + adoxq %r8, %rbx + movq $38, %rdx + mulxq %rbx, %rbx, %rcx + addq %rbx, %r12 + adcq $0x00, %rcx + movq $0x7fffffffffffffff, %r8 + shldq $0x01, %r12, %rcx + imulq $19, %rcx, %rcx + andq %r8, %r12 + xorq %r8, %r8 + adoxq %rcx, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rax, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - addq $40, %rsp + mulxq %r15, %rcx, %r15 + adcxq %rcx, %r11 + adoxq %r15, %r12 + adcxq %r8, %r12 + # Store + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + addq $16, %rsp + popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 - popq %rbx repz retq #ifndef __APPLE__ -.size fe_ge_to_p2_avx2,.-fe_ge_to_p2_avx2 +.size ge_p1p1_to_p2_avx2,.-ge_p1p1_to_p2_avx2 #endif /* __APPLE__ */ #ifndef __APPLE__ .text -.globl fe_ge_to_p3_avx2 -.type fe_ge_to_p3_avx2,@function +.globl ge_p1p1_to_p3_avx2 +.type ge_p1p1_to_p3_avx2,@function .align 16 -fe_ge_to_p3_avx2: +ge_p1p1_to_p3_avx2: #else .section __TEXT,__text -.globl _fe_ge_to_p3_avx2 +.globl _ge_p1p1_to_p3_avx2 .p2align 4 -_fe_ge_to_p3_avx2: +_ge_p1p1_to_p3_avx2: #endif /* __APPLE__ */ - pushq %rbx pushq %r12 pushq %r13 pushq 
%r14 pushq %r15 - subq $40, %rsp - movq %rsi, (%rsp) - movq %rdx, 8(%rsp) - movq %rcx, 16(%rsp) - movq %r8, 24(%rsp) - movq %r9, 32(%rsp) - movq 24(%rsp), %rsi - movq 96(%rsp), %rbx + pushq %rbx + subq $16, %rsp + movq %rdi, (%rsp) + movq %rsi, 8(%rsp) + leaq 96(%rsi), %rax # Multiply # A[0] * B[0] - movq (%rbx), %rdx - mulxq (%rsi), %r8, %r9 + movq (%rax), %rdx + mulxq (%rsi), %r9, %r10 # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 + mulxq 16(%rsi), %r11, %r12 # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbx), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 + mulxq 8(%rsi), %rcx, %r8 + xorq %rbx, %rbx + adcxq %rcx, %r10 + # A[3] * B[1] + movq 8(%rax), %rdx + mulxq 24(%rsi), %r13, %r14 + adcxq %r8, %r11 # A[0] * B[1] - movq 8(%rbx), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 + mulxq (%rsi), %rcx, %r8 + adoxq %rcx, %r10 # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 + mulxq 16(%rsi), %rcx, %r15 + adoxq %r8, %r11 + adcxq %rcx, %r12 # A[1] * B[2] - movq 16(%rbx), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 + movq 16(%rax), %rdx + mulxq 8(%rsi), %rcx, %r8 adcxq %r15, %r13 - adoxq %rax, %r12 + adoxq %rcx, %r12 + adcxq %rbx, %r14 + adoxq %r8, %r13 # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 + mulxq (%rsi), %rcx, %r8 + adoxq %rbx, %r14 + xorq %r15, %r15 + adcxq %rcx, %r11 # A[1] * B[1] - movq 8(%rbx), %rdx + movq 8(%rax), %rdx mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbx), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 + adcxq %r8, %r12 + adoxq %rdx, %r11 + # A[1] * B[3] + movq 24(%rax), %rdx + adoxq %rcx, %r12 + mulxq 8(%rsi), %rcx, %r8 + adcxq %rcx, %r13 # A[2] * B[2] - movq 16(%rbx), %rdx + movq 16(%rax), %rdx mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 + adcxq %r8, %r14 + adoxq %rdx, %r13 # A[3] * B[3] - movq 24(%rbx), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 + movq 24(%rax), %rdx + adoxq %rcx, %r14 + mulxq 24(%rsi), %rcx, %r8 + adoxq %rbx, %r15 + adcxq %rcx, %r15 # A[0] * B[3] mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 + adcxq %r8, %rbx + xorq %r8, %r8 + adcxq %rdx, %r12 # A[3] * B[0] - movq (%rbx), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbx), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 + movq 24(%rsi), %rdx + adcxq %rcx, %r13 + mulxq (%rax), %rdx, %rcx + adoxq %rdx, %r12 adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 + # A[3] * B[2] + movq 24(%rsi), %rdx + mulxq 16(%rax), %rdx, %rcx + adcxq %rdx, %r14 + # A[2] * B[3] + movq 24(%rax), %rdx + adcxq %rcx, %r15 + mulxq 16(%rsi), %rcx, %rdx + adcxq %r8, %rbx + adoxq %rcx, %r14 + adoxq %rdx, %r15 + adoxq %r8, %rbx + movq $38, %rdx + mulxq %rbx, %rbx, %rcx + addq %rbx, %r12 + adcq $0x00, %rcx + movq $0x7fffffffffffffff, %r8 + shldq $0x01, %r12, %rcx + 
imulq $19, %rcx, %rcx + andq %r8, %r12 + xorq %r8, %r8 + adoxq %rcx, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rax, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + mulxq %r15, %rcx, %r15 + adcxq %rcx, %r11 + adoxq %r15, %r12 + adcxq %r8, %r12 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq (%rsp), %rdi - movq 32(%rsp), %rsi - movq 88(%rsp), %rbx + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + leaq 32(%rsi), %rax + leaq 96(%rdi), %rdi # Multiply # A[0] * B[0] - movq (%rbx), %rdx - mulxq (%rsi), %r8, %r9 + movq (%rax), %rdx + mulxq (%rsi), %r9, %r10 # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 + mulxq 16(%rsi), %r11, %r12 # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbx), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 + mulxq 8(%rsi), %rcx, %r8 + xorq %rbx, %rbx + adcxq %rcx, %r10 + # A[3] * B[1] + movq 8(%rax), %rdx + mulxq 24(%rsi), %r13, %r14 + adcxq %r8, %r11 # A[0] * B[1] - movq 8(%rbx), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 + mulxq (%rsi), %rcx, %r8 + adoxq %rcx, %r10 # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 + mulxq 16(%rsi), %rcx, %r15 + adoxq %r8, %r11 + adcxq %rcx, %r12 # A[1] * B[2] - movq 16(%rbx), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 + movq 16(%rax), %rdx + mulxq 8(%rsi), %rcx, %r8 adcxq %r15, %r13 - adoxq %rax, %r12 + adoxq %rcx, %r12 + adcxq %rbx, %r14 + adoxq %r8, %r13 # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 + mulxq (%rsi), %rcx, %r8 + adoxq %rbx, %r14 + xorq %r15, %r15 + adcxq %rcx, %r11 # A[1] * B[1] - movq 8(%rbx), %rdx + movq 8(%rax), %rdx mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbx), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 + adcxq %r8, %r12 + adoxq %rdx, %r11 + # A[1] * B[3] + movq 24(%rax), %rdx + adoxq %rcx, %r12 + mulxq 8(%rsi), %rcx, %r8 + adcxq %rcx, %r13 # A[2] * B[2] - movq 16(%rbx), %rdx + movq 16(%rax), %rdx mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 + adcxq %r8, %r14 + adoxq %rdx, %r13 # A[3] * B[3] - movq 24(%rbx), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 + movq 24(%rax), %rdx + adoxq %rcx, %r14 + mulxq 24(%rsi), %rcx, %r8 + adoxq %rbx, %r15 + adcxq %rcx, %r15 # A[0] * B[3] mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 + adcxq %r8, %rbx + xorq %r8, %r8 + adcxq %rdx, %r12 # A[3] * B[0] - movq (%rbx), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbx), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 + movq 24(%rsi), %rdx + adcxq %rcx, %r13 + mulxq (%rax), %rdx, %rcx + adoxq %rdx, %r12 adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq 
$0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 + # A[3] * B[2] + movq 24(%rsi), %rdx + mulxq 16(%rax), %rdx, %rcx + adcxq %rdx, %r14 + # A[2] * B[3] + movq 24(%rax), %rdx + adcxq %rcx, %r15 + mulxq 16(%rsi), %rcx, %rdx + adcxq %r8, %rbx + adoxq %rcx, %r14 + adoxq %rdx, %r15 + adoxq %r8, %rbx + movq $38, %rdx + mulxq %rbx, %rbx, %rcx + addq %rbx, %r12 + adcq $0x00, %rcx + movq $0x7fffffffffffffff, %r8 + shldq $0x01, %r12, %rcx + imulq $19, %rcx, %rcx + andq %r8, %r12 + xorq %r8, %r8 + adoxq %rcx, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rax, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + mulxq %r15, %rcx, %r15 + adcxq %rcx, %r11 + adoxq %r15, %r12 + adcxq %r8, %r12 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq 96(%rsp), %rsi + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + leaq 64(%rsi), %rsi + leaq -64(%rdi), %rdi # Multiply # A[0] * B[0] - movq (%rsi), %rdx - mulxq (%rbx), %r8, %r9 + movq (%rax), %rdx + mulxq (%rsi), %r9, %r10 # A[2] * B[0] - mulxq 16(%rbx), %r10, %r11 + mulxq 16(%rsi), %r11, %r12 # A[1] * B[0] - mulxq 8(%rbx), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rsi), %rdx - mulxq 8(%rbx), %r12, %r13 - adcxq %rax, %r10 - # A[0] * B[1] - movq 8(%rsi), %rdx - mulxq (%rbx), %rcx, %rax - adoxq %rcx, %r9 + mulxq 8(%rsi), %rcx, %r8 + xorq %rbx, %rbx + adcxq %rcx, %r10 + # A[3] * B[1] + movq 8(%rax), %rdx + mulxq 24(%rsi), %r13, %r14 + adcxq %r8, %r11 + # A[0] * B[1] + mulxq (%rsi), %rcx, %r8 + adoxq %rcx, %r10 # A[2] * B[1] - mulxq 16(%rbx), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 + mulxq 16(%rsi), %rcx, %r15 + adoxq %r8, %r11 + adcxq %rcx, %r12 # A[1] * B[2] - movq 16(%rsi), %rdx - mulxq 8(%rbx), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 + movq 16(%rax), %rdx + mulxq 8(%rsi), %rcx, %r8 adcxq %r15, %r13 - adoxq %rax, %r12 + adoxq %rcx, %r12 + adcxq %rbx, %r14 + adoxq %r8, %r13 # A[0] * B[2] - mulxq (%rbx), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 + mulxq (%rsi), %rcx, %r8 + adoxq %rbx, %r14 + xorq %r15, %r15 + adcxq %rcx, %r11 # A[1] * B[1] - movq 8(%rsi), %rdx - mulxq 8(%rbx), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rsi), %rdx - adoxq %rcx, %r11 - mulxq 24(%rbx), %rcx, %rax - adcxq %rcx, %r12 + movq 8(%rax), %rdx + mulxq 8(%rsi), %rdx, %rcx + adcxq %r8, %r12 + adoxq %rdx, %r11 + # A[1] * B[3] + movq 24(%rax), %rdx + adoxq %rcx, %r12 + mulxq 8(%rsi), %rcx, %r8 + adcxq %rcx, %r13 # A[2] * B[2] - movq 16(%rsi), %rdx - mulxq 16(%rbx), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 + movq 16(%rax), %rdx + mulxq 16(%rsi), %rdx, %rcx + adcxq %r8, %r14 + adoxq %rdx, %r13 # A[3] * B[3] - movq 24(%rsi), %rdx - adoxq %rcx, %r13 - mulxq 24(%rbx), 
%rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 + movq 24(%rax), %rdx + adoxq %rcx, %r14 + mulxq 24(%rsi), %rcx, %r8 + adoxq %rbx, %r15 + adcxq %rcx, %r15 # A[0] * B[3] - mulxq (%rbx), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 + mulxq (%rsi), %rdx, %rcx + adcxq %r8, %rbx + xorq %r8, %r8 + adcxq %rdx, %r12 # A[3] * B[0] - movq (%rsi), %rdx - adcxq %rcx, %r12 - mulxq 24(%rbx), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] movq 24(%rsi), %rdx - mulxq 16(%rbx), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rsi), %rdx - adcxq %rcx, %r14 - mulxq 24(%rbx), %rcx, %rdx - adcxq %rax, %r15 + adcxq %rcx, %r13 + mulxq (%rax), %rdx, %rcx + adoxq %rdx, %r12 adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 + # A[3] * B[2] + movq 24(%rsi), %rdx + mulxq 16(%rax), %rdx, %rcx + adcxq %rdx, %r14 + # A[2] * B[3] + movq 24(%rax), %rdx + adcxq %rcx, %r15 + mulxq 16(%rsi), %rcx, %rdx + adcxq %r8, %rbx + adoxq %rcx, %r14 + adoxq %rdx, %r15 + adoxq %r8, %rbx + movq $38, %rdx + mulxq %rbx, %rbx, %rcx + addq %rbx, %r12 + adcq $0x00, %rcx + movq $0x7fffffffffffffff, %r8 + shldq $0x01, %r12, %rcx + imulq $19, %rcx, %rcx + andq %r8, %r12 + xorq %r8, %r8 + adoxq %rcx, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rax, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + mulxq %r15, %rcx, %r15 + adcxq %rcx, %r11 + adoxq %r15, %r12 + adcxq %r8, %r12 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - movq 24(%rsp), %rsi - movq 32(%rsp), %rbx + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + leaq 32(%rsi), %rax + leaq 32(%rdi), %rdi # Multiply # A[0] * B[0] - movq (%rbx), %rdx - mulxq (%rsi), %r8, %r9 + movq (%rax), %rdx + mulxq (%rsi), %r9, %r10 # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 + mulxq 16(%rsi), %r11, %r12 # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbx), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 + mulxq 8(%rsi), %rcx, %r8 + xorq %rbx, %rbx + adcxq %rcx, %r10 + # A[3] * B[1] + movq 8(%rax), %rdx + mulxq 24(%rsi), %r13, %r14 + adcxq %r8, %r11 # A[0] * B[1] - movq 8(%rbx), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 + mulxq (%rsi), %rcx, %r8 + adoxq %rcx, %r10 # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 + mulxq 16(%rsi), %rcx, %r15 + adoxq %r8, %r11 + adcxq %rcx, %r12 # A[1] * B[2] - movq 16(%rbx), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 + movq 16(%rax), %rdx + mulxq 8(%rsi), %rcx, %r8 adcxq %r15, %r13 - adoxq %rax, %r12 + adoxq %rcx, %r12 + adcxq %rbx, %r14 + adoxq %r8, %r13 # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, 
%r13 - xorq %r14, %r14 - adcxq %rcx, %r10 + mulxq (%rsi), %rcx, %r8 + adoxq %rbx, %r14 + xorq %r15, %r15 + adcxq %rcx, %r11 # A[1] * B[1] - movq 8(%rbx), %rdx + movq 8(%rax), %rdx mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbx), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 + adcxq %r8, %r12 + adoxq %rdx, %r11 + # A[1] * B[3] + movq 24(%rax), %rdx + adoxq %rcx, %r12 + mulxq 8(%rsi), %rcx, %r8 + adcxq %rcx, %r13 # A[2] * B[2] - movq 16(%rbx), %rdx + movq 16(%rax), %rdx mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 + adcxq %r8, %r14 + adoxq %rdx, %r13 # A[3] * B[3] - movq 24(%rbx), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 + movq 24(%rax), %rdx + adoxq %rcx, %r14 + mulxq 24(%rsi), %rcx, %r8 + adoxq %rbx, %r15 + adcxq %rcx, %r15 # A[0] * B[3] mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 + adcxq %r8, %rbx + xorq %r8, %r8 + adcxq %rdx, %r12 # A[3] * B[0] - movq (%rbx), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbx), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 + movq 24(%rsi), %rdx + adcxq %rcx, %r13 + mulxq (%rax), %rdx, %rcx + adoxq %rdx, %r12 adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 + # A[3] * B[2] + movq 24(%rsi), %rdx + mulxq 16(%rax), %rdx, %rcx + adcxq %rdx, %r14 + # A[2] * B[3] + movq 24(%rax), %rdx + adcxq %rcx, %r15 + mulxq 16(%rsi), %rcx, %rdx + adcxq %r8, %rbx + adoxq %rcx, %r14 + adoxq %rdx, %r15 + adoxq %r8, %rbx + movq $38, %rdx + mulxq %rbx, %rbx, %rcx + addq %rbx, %r12 + adcq $0x00, %rcx + movq $0x7fffffffffffffff, %r8 + shldq $0x01, %r12, %rcx + imulq $19, %rcx, %rcx + andq %r8, %r12 + xorq %r8, %r8 + adoxq %rcx, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rax, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + mulxq %r15, %rcx, %r15 + adcxq %rcx, %r11 + adoxq %r15, %r12 + adcxq %r8, %r12 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - addq $40, %rsp + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + addq $16, %rsp + popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 - popq %rbx repz retq #ifndef __APPLE__ -.size fe_ge_to_p3_avx2,.-fe_ge_to_p3_avx2 +.size ge_p1p1_to_p3_avx2,.-ge_p1p1_to_p3_avx2 #endif /* __APPLE__ */ #ifndef __APPLE__ .text -.globl fe_ge_dbl_avx2 -.type fe_ge_dbl_avx2,@function +.globl ge_p2_dbl_avx2 +.type ge_p2_dbl_avx2,@function .align 16 -fe_ge_dbl_avx2: +ge_p2_dbl_avx2: #else .section __TEXT,__text -.globl 
_fe_ge_dbl_avx2 +.globl _ge_p2_dbl_avx2 .p2align 4 -_fe_ge_dbl_avx2: +_ge_p2_dbl_avx2: #endif /* __APPLE__ */ - pushq %rbp - pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 - subq $48, %rsp + pushq %rbx + pushq %rbp + subq $16, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rcx, 24(%rsp) - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq 32(%rsp), %rsi + leaq 64(%rdi), %rdi # Square - # A[0] * A[1] movq (%rsi), %rdx - mulxq 8(%rsi), %r9, %r10 + movq 8(%rsi), %r9 + # A[0] * A[1] + movq %rdx, %rbp + mulxq %r9, %r11, %r12 # A[0] * A[3] - mulxq 24(%rsi), %r11, %r12 + mulxq 24(%rsi), %r13, %r14 # A[2] * A[1] movq 16(%rsi), %rdx - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adoxq %rcx, %r11 + mulxq %r9, %rcx, %r8 + xorq %r10, %r10 + adoxq %rcx, %r13 # A[2] * A[3] - mulxq 24(%rsi), %r13, %r14 - adoxq %rax, %r12 + mulxq 24(%rsi), %r15, %rbx + adoxq %r8, %r14 # A[2] * A[0] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - adcxq %rcx, %r10 - adoxq %r15, %r14 + mulxq %rbp, %rcx, %r8 + adoxq %r10, %r15 + adcxq %rcx, %r12 + adoxq %r10, %rbx # A[1] * A[3] - movq 8(%rsi), %rdx - mulxq 24(%rsi), %rbp, %r8 - adcxq %rax, %r11 - adcxq %rbp, %r12 + movq %r9, %rdx + mulxq 24(%rsi), %rcx, %rdx adcxq %r8, %r13 - adcxq %r15, %r14 - # Double with Carry Flag - xorq %r15, %r15 + adcxq %rcx, %r14 + adcxq %rdx, %r15 + adcxq %r10, %rbx # A[0] * A[0] - movq (%rsi), %rdx - mulxq %rdx, %r8, %rbp - adcxq %r9, %r9 - # A[1] * A[1] - movq 8(%rsi), %rdx - mulxq %rdx, %rcx, %rax - adcxq %r10, %r10 - adoxq %rbp, %r9 + movq %rbp, %rdx + mulxq %rdx, %r10, %rcx + xorq %rbp, %rbp adcxq %r11, %r11 - adoxq %rcx, %r10 - # A[2] * A[2] - movq 16(%rsi), %rdx - mulxq %rdx, %rbp, %rcx - adcxq %r12, %r12 - adoxq %rax, %r11 + # A[1] * A[1] + movq %r9, %rdx + adoxq %rcx, %r11 + mulxq %rdx, %rcx, %r8 + adcxq %r12, %r12 + adoxq %rcx, %r12 adcxq %r13, %r13 - adoxq %rbp, %r12 - # A[3] * A[3] - movq 24(%rsi), %rdx - mulxq %rdx, %rbp, %rax + # A[2] * A[2] + movq 16(%rsi), %rdx + adoxq %r8, %r13 + mulxq %rdx, %r8, %rcx adcxq %r14, %r14 - adoxq %rcx, %r13 + adoxq %r8, %r14 adcxq %r15, %r15 - adoxq %rbp, %r14 - adoxq %rax, %r15 - # Reduce + # A[3] * A[3] + movq 24(%rsi), %rdx + adoxq %rcx, %r15 + mulxq %rdx, %rcx, %r8 + adcxq %rbx, %rbx + adoxq %rcx, %rbx + adcxq %rbp, %rbp + adoxq %r8, %rbp + movq $38, %rdx + mulxq %rbp, %rbp, %r8 + addq %rbp, %r13 + adcq $0x00, %r8 movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rdx + shldq $0x01, %r13, %r8 + imulq $19, %r8, %r8 + andq %rcx, %r13 xorq %rcx, %rcx - mulxq %r12, %rbp, %r12 - adcxq %rbp, %r8 - adoxq %r12, %r9 - mulxq %r13, %rbp, %r13 - adcxq %rbp, %r9 - adoxq %r13, %r10 - mulxq %r14, %rbp, %r14 - adcxq %rbp, %r10 + adoxq %r8, %r10 + mulxq %r14, %r8, %r14 + adcxq %r8, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rcx, %rdx - adcxq %rcx, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rcx - imulq $19, %rdx, %rbp - andq %rcx, %r11 - addq %rbp, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + mulxq %r15, %r8, %r15 + adcxq %r8, %r11 + adoxq %r15, %r12 + mulxq %rbx, %r8, %rbx + adcxq %r8, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 # Store - movq %r8, 
(%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 16(%rsp), %rdi - movq 40(%rsp), %rbx + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + leaq 32(%rsi), %rsi # Square + movq (%rsi), %rdx + movq 8(%rsi), %r9 # A[0] * A[1] - movq (%rbx), %rdx - mulxq 8(%rbx), %r9, %r10 + movq %rdx, %rbp + mulxq %r9, %r11, %r12 # A[0] * A[3] - mulxq 24(%rbx), %r11, %r12 + mulxq 24(%rsi), %r13, %r14 # A[2] * A[1] - movq 16(%rbx), %rdx - mulxq 8(%rbx), %rcx, %rax - xorq %r15, %r15 - adoxq %rcx, %r11 + movq 16(%rsi), %rdx + mulxq %r9, %rcx, %r8 + xorq %r10, %r10 + adoxq %rcx, %r13 # A[2] * A[3] - mulxq 24(%rbx), %r13, %r14 - adoxq %rax, %r12 + mulxq 24(%rsi), %r15, %rbx + adoxq %r8, %r14 # A[2] * A[0] - mulxq (%rbx), %rcx, %rax - adoxq %r15, %r13 - adcxq %rcx, %r10 - adoxq %r15, %r14 + mulxq %rbp, %rcx, %r8 + adoxq %r10, %r15 + adcxq %rcx, %r12 + adoxq %r10, %rbx # A[1] * A[3] - movq 8(%rbx), %rdx - mulxq 24(%rbx), %rbp, %r8 - adcxq %rax, %r11 - adcxq %rbp, %r12 + movq %r9, %rdx + mulxq 24(%rsi), %rcx, %rdx adcxq %r8, %r13 - adcxq %r15, %r14 - # Double with Carry Flag - xorq %r15, %r15 + adcxq %rcx, %r14 + adcxq %rdx, %r15 + adcxq %r10, %rbx # A[0] * A[0] - movq (%rbx), %rdx - mulxq %rdx, %r8, %rbp - adcxq %r9, %r9 - # A[1] * A[1] - movq 8(%rbx), %rdx - mulxq %rdx, %rcx, %rax - adcxq %r10, %r10 - adoxq %rbp, %r9 + movq %rbp, %rdx + mulxq %rdx, %r10, %rcx + xorq %rbp, %rbp adcxq %r11, %r11 - adoxq %rcx, %r10 - # A[2] * A[2] - movq 16(%rbx), %rdx - mulxq %rdx, %rbp, %rcx + # A[1] * A[1] + movq %r9, %rdx + adoxq %rcx, %r11 + mulxq %rdx, %rcx, %r8 adcxq %r12, %r12 - adoxq %rax, %r11 + adoxq %rcx, %r12 adcxq %r13, %r13 - adoxq %rbp, %r12 - # A[3] * A[3] - movq 24(%rbx), %rdx - mulxq %rdx, %rbp, %rax + # A[2] * A[2] + movq 16(%rsi), %rdx + adoxq %r8, %r13 + mulxq %rdx, %r8, %rcx adcxq %r14, %r14 - adoxq %rcx, %r13 + adoxq %r8, %r14 adcxq %r15, %r15 - adoxq %rbp, %r14 - adoxq %rax, %r15 - # Reduce + # A[3] * A[3] + movq 24(%rsi), %rdx + adoxq %rcx, %r15 + mulxq %rdx, %rcx, %r8 + adcxq %rbx, %rbx + adoxq %rcx, %rbx + adcxq %rbp, %rbp + adoxq %r8, %rbp + movq $38, %rdx + mulxq %rbp, %rbp, %r8 + addq %rbp, %r13 + adcq $0x00, %r8 movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rdx + shldq $0x01, %r13, %r8 + imulq $19, %r8, %r8 + andq %rcx, %r13 xorq %rcx, %rcx - mulxq %r12, %rbp, %r12 - adcxq %rbp, %r8 - adoxq %r12, %r9 - mulxq %r13, %rbp, %r13 - adcxq %rbp, %r9 - adoxq %r13, %r10 - mulxq %r14, %rbp, %r14 - adcxq %rbp, %r10 + adoxq %r8, %r10 + mulxq %r14, %r8, %r14 + adcxq %r8, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rcx, %rdx - adcxq %rcx, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rcx - imulq $19, %rdx, %rbp - andq %rcx, %r11 - addq %rbp, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + mulxq %r15, %r8, %r15 + adcxq %r8, %r11 + adoxq %r15, %r12 + mulxq %rbx, %r8, %rbx + adcxq %r8, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi + movq %rdi, %rsi + leaq -32(%rdi), %rdi + # Add-Sub # Add - movq (%rsi), %r8 - movq 
8(%rsi), %r9 - addq (%rbx), %r8 - movq 16(%rsi), %r10 - adcq 8(%rbx), %r9 - movq 24(%rsi), %rdx - adcq 16(%rbx), %r10 - movq $-19, %rcx - adcq 24(%rbx), %rdx - movq $0x7fffffffffffffff, %rax - movq %rdx, %r11 - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + movq %r10, %r14 + addq (%rsi), %r10 + movq %r11, %r15 + adcq 8(%rsi), %r11 + movq %r12, %rbx + adcq 16(%rsi), %r12 + movq %r13, %rbp + adcq 24(%rsi), %r13 + movq $0x00, %rcx + adcq $0x00, %rcx + shldq $0x01, %r13, %rcx + movq $0x7fffffffffffffff, %r8 + imulq $19, %rcx + andq %r8, %r13 # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 24(%rsp), %rsi + addq %rcx, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 + # Sub + subq (%rsi), %r14 + sbbq 8(%rsi), %r15 + sbbq 16(%rsi), %rbx + sbbq 24(%rsi), %rbp + sbbq %rcx, %rcx + shldq $0x01, %rbp, %rcx + imulq $-19, %rcx + andq %r8, %rbp + # Add modulus (if underflow) + subq %rcx, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + movq %r14, (%rsi) + movq %r15, 8(%rsi) + movq %rbx, 16(%rsi) + movq %rbp, 24(%rsi) + movq 8(%rsp), %rax + leaq 32(%rax), %rsi + leaq -32(%rdi), %rdi + # Add + movq (%rsi), %r10 + movq 8(%rsi), %r11 + addq (%rax), %r10 + movq 16(%rsi), %r12 + adcq 8(%rax), %r11 + movq 24(%rsi), %r13 + adcq 16(%rax), %r12 + adcq 24(%rax), %r13 + movq $0x00, %rcx + adcq $0x00, %rcx + shldq $0x01, %r13, %rcx + movq $0x7fffffffffffffff, %r8 + imulq $19, %rcx + andq %r8, %r13 + # Sub modulus (if overflow) + addq %rcx, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) # Square - # A[0] * A[1] movq (%rdi), %rdx - mulxq 8(%rdi), %r9, %r10 + movq 8(%rdi), %r9 + # A[0] * A[1] + movq %rdx, %rbp + mulxq %r9, %r11, %r12 # A[0] * A[3] - mulxq 24(%rdi), %r11, %r12 + mulxq 24(%rdi), %r13, %r14 # A[2] * A[1] movq 16(%rdi), %rdx - mulxq 8(%rdi), %rcx, %rax - xorq %r15, %r15 - adoxq %rcx, %r11 + mulxq %r9, %rcx, %r8 + xorq %r10, %r10 + adoxq %rcx, %r13 # A[2] * A[3] - mulxq 24(%rdi), %r13, %r14 - adoxq %rax, %r12 + mulxq 24(%rdi), %r15, %rbx + adoxq %r8, %r14 # A[2] * A[0] - mulxq (%rdi), %rcx, %rax - adoxq %r15, %r13 - adcxq %rcx, %r10 - adoxq %r15, %r14 + mulxq %rbp, %rcx, %r8 + adoxq %r10, %r15 + adcxq %rcx, %r12 + adoxq %r10, %rbx # A[1] * A[3] - movq 8(%rdi), %rdx - mulxq 24(%rdi), %rbp, %r8 - adcxq %rax, %r11 - adcxq %rbp, %r12 + movq %r9, %rdx + mulxq 24(%rdi), %rcx, %rdx adcxq %r8, %r13 - adcxq %r15, %r14 - # Double with Carry Flag - xorq %r15, %r15 + adcxq %rcx, %r14 + adcxq %rdx, %r15 + adcxq %r10, %rbx # A[0] * A[0] - movq (%rdi), %rdx - mulxq %rdx, %r8, %rbp - adcxq %r9, %r9 - # A[1] * A[1] - movq 8(%rdi), %rdx - mulxq %rdx, %rcx, %rax - adcxq %r10, %r10 - adoxq %rbp, %r9 + movq %rbp, %rdx + mulxq %rdx, %r10, %rcx + xorq %rbp, %rbp adcxq %r11, %r11 - adoxq %rcx, %r10 - # A[2] * A[2] - movq 16(%rdi), %rdx - mulxq %rdx, %rbp, %rcx + # A[1] * A[1] + movq %r9, %rdx + adoxq %rcx, %r11 + mulxq %rdx, %rcx, %r8 adcxq %r12, %r12 - adoxq %rax, %r11 + adoxq %rcx, %r12 adcxq %r13, %r13 - adoxq %rbp, %r12 - # A[3] * A[3] - movq 24(%rdi), %rdx - mulxq %rdx, %rbp, %rax + # A[2] * A[2] + movq 16(%rdi), %rdx + adoxq %r8, %r13 + mulxq %rdx, %r8, %rcx adcxq %r14, %r14 - adoxq %rcx, %r13 + adoxq %r8, %r14 adcxq %r15, %r15 - adoxq %rbp, %r14 - 
adoxq %rax, %r15 - # Reduce + # A[3] * A[3] + movq 24(%rdi), %rdx + adoxq %rcx, %r15 + mulxq %rdx, %rcx, %r8 + adcxq %rbx, %rbx + adoxq %rcx, %rbx + adcxq %rbp, %rbp + adoxq %r8, %rbp + movq $38, %rdx + mulxq %rbp, %rbp, %r8 + addq %rbp, %r13 + adcq $0x00, %r8 movq $0x7fffffffffffffff, %rcx - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rcx, %r11 - # Multiply top half by 19 - movq $19, %rdx + shldq $0x01, %r13, %r8 + imulq $19, %r8, %r8 + andq %rcx, %r13 xorq %rcx, %rcx - mulxq %r12, %rbp, %r12 - adcxq %rbp, %r8 - adoxq %r12, %r9 - mulxq %r13, %rbp, %r13 - adcxq %rbp, %r9 - adoxq %r13, %r10 - mulxq %r14, %rbp, %r14 - adcxq %rbp, %r10 + adoxq %r8, %r10 + mulxq %r14, %r8, %r14 + adcxq %r8, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rcx, %rdx - adcxq %rcx, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rcx - imulq $19, %rdx, %rbp - andq %rcx, %r11 - addq %rbp, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rcx, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + mulxq %r15, %r8, %r15 + adcxq %r8, %r11 + adoxq %r15, %r12 + mulxq %rbx, %r8, %rbx + adcxq %r8, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 # Store - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - movq 16(%rsp), %rsi - movq (%rsp), %rbx - # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %rdx - movq %r8, %r12 - addq (%rbx), %r8 - movq %r9, %r13 - adcq 8(%rbx), %r9 - movq %r10, %r14 - adcq 16(%rbx), %r10 - movq %rdx, %r15 - adcq 24(%rbx), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - # Sub - subq (%rbx), %r12 - movq $0x00, %rdx - sbbq 8(%rbx), %r13 - movq $-19, %rcx - sbbq 16(%rbx), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rbx), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq %r12, (%rsi) - movq %r13, 8(%rsi) - movq %r14, 16(%rsi) - movq %r15, 24(%rsi) - movq 24(%rsp), %rsi + leaq 32(%rdi), %rsi # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rdi), %r8 - movq $0x00, %rdx - sbbq 8(%rdi), %r9 - movq $-19, %rcx - sbbq 16(%rdi), %r10 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rdi), %r11 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + subq (%rsi), %r10 + sbbq 8(%rsi), %r11 + sbbq 16(%rsi), %r12 + sbbq 24(%rsi), %r13 + sbbq %rcx, %rcx + shldq $0x01, %r13, %rcx + movq $0x7fffffffffffffff, %r8 + imulq $-19, %rcx + andq %r8, %r13 # Add modulus (if underflow) - addq %rcx, %r8 - adcq %rdx, %r9 - adcq %rdx, %r10 - adcq %rax, %r11 - movq %r8, (%rbx) - movq %r9, 8(%rbx) - movq %r10, 16(%rbx) - movq %r11, 24(%rbx) - movq 104(%rsp), %rdi + subq %rcx, %r10 + sbbq $0x00, %r11 + sbbq $0x00, %r12 + sbbq $0x00, %r13 + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + leaq 64(%rax), %rax # Square * 2 + movq (%rax), %rdx + movq 8(%rax), %r9 # A[0] 
* A[1] - movq (%rdi), %rdx - mulxq 8(%rdi), %r9, %r10 + movq %rdx, %rbp + mulxq %r9, %r11, %r12 # A[0] * A[3] - mulxq 24(%rdi), %r11, %r12 + mulxq 24(%rax), %r13, %r14 # A[2] * A[1] - movq 16(%rdi), %rdx - mulxq 8(%rdi), %rcx, %rax - xorq %r15, %r15 - adoxq %rcx, %r11 + movq 16(%rax), %rdx + mulxq %r9, %rcx, %r8 + xorq %r10, %r10 + adoxq %rcx, %r13 # A[2] * A[3] - mulxq 24(%rdi), %r13, %r14 - adoxq %rax, %r12 + mulxq 24(%rax), %r15, %rbx + adoxq %r8, %r14 # A[2] * A[0] - mulxq (%rdi), %rcx, %rax - adoxq %r15, %r13 - adcxq %rcx, %r10 - adoxq %r15, %r14 + mulxq %rbp, %rcx, %r8 + adoxq %r10, %r15 + adcxq %rcx, %r12 + adoxq %r10, %rbx # A[1] * A[3] - movq 8(%rdi), %rdx - mulxq 24(%rdi), %rbp, %r8 - adcxq %rax, %r11 - adcxq %rbp, %r12 + movq %r9, %rdx + mulxq 24(%rax), %rcx, %rdx adcxq %r8, %r13 - adcxq %r15, %r14 - # Double with Carry Flag - xorq %r15, %r15 + adcxq %rcx, %r14 + adcxq %rdx, %r15 + adcxq %r10, %rbx # A[0] * A[0] - movq (%rdi), %rdx - mulxq %rdx, %r8, %rbp - adcxq %r9, %r9 - # A[1] * A[1] - movq 8(%rdi), %rdx - mulxq %rdx, %rcx, %rax - adcxq %r10, %r10 - adoxq %rbp, %r9 + movq %rbp, %rdx + mulxq %rdx, %r10, %rcx + xorq %rbp, %rbp adcxq %r11, %r11 - adoxq %rcx, %r10 - # A[2] * A[2] - movq 16(%rdi), %rdx - mulxq %rdx, %rbp, %rcx + # A[1] * A[1] + movq %r9, %rdx + adoxq %rcx, %r11 + mulxq %rdx, %rcx, %r8 adcxq %r12, %r12 - adoxq %rax, %r11 + adoxq %rcx, %r12 adcxq %r13, %r13 - adoxq %rbp, %r12 - # A[3] * A[3] - movq 24(%rdi), %rdx - mulxq %rdx, %rbp, %rax + # A[2] * A[2] + movq 16(%rax), %rdx + adoxq %r8, %r13 + mulxq %rdx, %r8, %rcx adcxq %r14, %r14 - adoxq %rcx, %r13 + adoxq %r8, %r14 adcxq %r15, %r15 - adoxq %rbp, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - xorq %rbp, %rbp - # Move top half into t4-t7 and remove top bit from t3 and double - shldq $3, %r15, %rbp - shldq $2, %r14, %r15 - shldq $2, %r13, %r14 - shldq $2, %r12, %r13 - shldq $2, %r11, %r12 - shldq $0x01, %r10, %r11 - shldq $0x01, %r9, %r10 - shldq $0x01, %r8, %r9 - shlq $0x01, %r8 - andq %rax, %r11 - # Two out left, one in right - andq %rax, %r15 - # Multiply top bits by 19*19 - imulq $0x169, %rbp, %rcx - xorq %rax, %rax - # Multiply top half by 19 - movq $19, %rdx - adoxq %rcx, %r8 - mulxq %r12, %rbp, %r12 - adcxq %rbp, %r8 - adoxq %r12, %r9 - mulxq %r13, %rbp, %r13 - adcxq %rbp, %r9 - adoxq %r13, %r10 - mulxq %r14, %rbp, %r14 - adcxq %rbp, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rbp - andq %rax, %r11 - addq %rbp, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rax, %r11 - addq %rdx, %r8 + # A[3] * A[3] + movq 24(%rax), %rdx + adoxq %rcx, %r15 + mulxq %rdx, %rcx, %r8 + adcxq %rbx, %rbx + adoxq %rcx, %rbx + adcxq %rbp, %rbp + adoxq %r8, %rbp + movq $38, %rdx + mulxq %rbp, %rbp, %r9 + addq %rbp, %r13 adcq $0x00, %r9 - adcq $0x00, %r10 + movq $0x7fffffffffffffff, %rcx + shldq $0x01, %r13, %r9 + imulq $19, %r9, %r9 + andq %rcx, %r13 + xorq %rcx, %rcx + adoxq %r9, %r10 + mulxq %r14, %r9, %r14 + adcxq %r9, %r10 + adoxq %r14, %r11 + mulxq %r15, %r9, %r15 + adcxq %r9, %r11 + adoxq %r15, %r12 + mulxq %rbx, %r9, %rbx + adcxq %r9, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + mov %r13, %r9 + shldq $0x01, %r12, %r13 + shldq $0x01, %r11, %r12 + shldq $0x01, %r10, %r11 + shlq $0x01, %r10 + movq $0x7fffffffffffffff, %rcx + shrq $62, %r9 + andq %rcx, %r13 + 
imulq $19, %r9, %r9 + addq %r9, %r10 adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 # Store - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - movq 16(%rsp), %rdi + leaq 64(%rdi), %rsi + leaq 96(%rdi), %rdi # Sub - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %r11 - subq (%rdi), %r8 - movq $0x00, %rdx - sbbq 8(%rdi), %r9 - movq $-19, %rcx - sbbq 16(%rdi), %r10 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rdi), %r11 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + subq (%rsi), %r10 + sbbq 8(%rsi), %r11 + sbbq 16(%rsi), %r12 + sbbq 24(%rsi), %r13 + sbbq %rcx, %rcx + shldq $0x01, %r13, %rcx + movq $0x7fffffffffffffff, %r8 + imulq $-19, %rcx + andq %r8, %r13 # Add modulus (if underflow) - addq %rcx, %r8 - adcq %rdx, %r9 - adcq %rdx, %r10 - adcq %rax, %r11 - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - addq $48, %rsp + subq %rcx, %r10 + sbbq $0x00, %r11 + sbbq $0x00, %r12 + sbbq $0x00, %r13 + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + addq $16, %rsp + popq %rbp + popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 - popq %rbx - popq %rbp repz retq #ifndef __APPLE__ -.size fe_ge_dbl_avx2,.-fe_ge_dbl_avx2 +.size ge_p2_dbl_avx2,.-ge_p2_dbl_avx2 #endif /* __APPLE__ */ #ifndef __APPLE__ .text -.globl fe_ge_madd_avx2 -.type fe_ge_madd_avx2,@function +.globl ge_madd_avx2 +.type ge_madd_avx2,@function .align 16 -fe_ge_madd_avx2: +ge_madd_avx2: #else .section __TEXT,__text -.globl _fe_ge_madd_avx2 +.globl _ge_madd_avx2 .p2align 4 -_fe_ge_madd_avx2: +_ge_madd_avx2: #endif /* __APPLE__ */ - pushq %rbp - pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 - subq $48, %rsp + pushq %rbx + pushq %rbp + movq %rdx, %rax + subq $24, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rcx, 24(%rsp) - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq 8(%rsp), %rsi - movq 40(%rsp), %rbx - movq 32(%rsp), %rbp - # Add - movq (%rbx), %r8 - movq 8(%rbx), %r9 - movq 16(%rbx), %r10 - movq 24(%rbx), %rdx - movq %r8, %r12 - addq (%rbp), %r8 - movq %r9, %r13 - adcq 8(%rbp), %r9 - movq %r10, %r14 - adcq 16(%rbp), %r10 - movq %rdx, %r15 - adcq 24(%rbp), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - # Sub - subq (%rbp), %r12 - movq $0x00, %rdx - sbbq 8(%rbp), %r13 - movq $-19, %rcx - sbbq 16(%rbp), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rbp), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq %r12, (%rsi) - movq %r13, 8(%rsi) - movq %r14, 16(%rsi) - movq %r15, 24(%rsi) - movq 16(%rsp), %rbx - movq 128(%rsp), %rbp + movq %rax, 16(%rsp) + leaq 96(%rsi), %rcx + leaq 64(%rax), %rax + leaq 96(%rdi), %rdi # Multiply # A[0] * B[0] - movq (%rbp), %rdx - mulxq (%rdi), %r8, %r9 + movq (%rax), %rdx + mulxq (%rcx), %r10, %r11 # A[2] * B[0] - mulxq 16(%rdi), %r10, %r11 + mulxq 16(%rcx), %r12, %r13 # A[1] * B[0] - mulxq 8(%rdi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbp), %rdx - mulxq 8(%rdi), %r12, %r13 - adcxq %rax, %r10 + mulxq 8(%rcx), %r8, %r9 + xorq %rbp, %rbp + adcxq %r8, 
%r11 + # A[3] * B[1] + movq 8(%rax), %rdx + mulxq 24(%rcx), %r14, %r15 + adcxq %r9, %r12 # A[0] * B[1] - movq 8(%rbp), %rdx - mulxq (%rdi), %rcx, %rax - adoxq %rcx, %r9 + mulxq (%rcx), %r8, %r9 + adoxq %r8, %r11 # A[2] * B[1] - mulxq 16(%rdi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 + mulxq 16(%rcx), %r8, %rbx + adoxq %r9, %r12 + adcxq %r8, %r13 # A[1] * B[2] - movq 16(%rbp), %rdx - mulxq 8(%rdi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 + movq 16(%rax), %rdx + mulxq 8(%rcx), %r8, %r9 + adcxq %rbx, %r14 + adoxq %r8, %r13 + adcxq %rbp, %r15 + adoxq %r9, %r14 # A[0] * B[2] - mulxq (%rdi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 + mulxq (%rcx), %r8, %r9 + adoxq %rbp, %r15 + xorq %rbx, %rbx + adcxq %r8, %r12 # A[1] * B[1] - movq 8(%rbp), %rdx - mulxq 8(%rdi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbp), %rdx - adoxq %rcx, %r11 - mulxq 24(%rdi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbp), %rdx - mulxq 16(%rdi), %rdx, %rcx - adcxq %rax, %r13 + movq 8(%rax), %rdx + mulxq 8(%rcx), %rdx, %r8 + adcxq %r9, %r13 adoxq %rdx, %r12 + # A[1] * B[3] + movq 24(%rax), %rdx + adoxq %r8, %r13 + mulxq 8(%rcx), %r8, %r9 + adcxq %r8, %r14 + # A[2] * B[2] + movq 16(%rax), %rdx + mulxq 16(%rcx), %rdx, %r8 + adcxq %r9, %r15 + adoxq %rdx, %r14 # A[3] * B[3] - movq 24(%rbp), %rdx - adoxq %rcx, %r13 - mulxq 24(%rdi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 + movq 24(%rax), %rdx + adoxq %r8, %r15 + mulxq 24(%rcx), %r8, %r9 + adoxq %rbp, %rbx + adcxq %r8, %rbx # A[0] * B[3] - mulxq (%rdi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbp), %rdx - adcxq %rcx, %r12 - mulxq 24(%rdi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbp), %rdx - mulxq 16(%rdi), %rdx, %rcx + mulxq (%rcx), %rdx, %r8 + adcxq %r9, %rbp + xorq %r9, %r9 adcxq %rdx, %r13 + # A[3] * B[0] + movq 24(%rcx), %rdx + adcxq %r8, %r14 + mulxq (%rax), %rdx, %r8 + adoxq %rdx, %r13 + adoxq %r8, %r14 # A[3] * B[2] - movq 16(%rbp), %rdx - adcxq %rcx, %r14 - mulxq 24(%rdi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 + movq 24(%rcx), %rdx + mulxq 16(%rax), %rdx, %r8 + adcxq %rdx, %r15 + # A[2] * B[3] + movq 24(%rax), %rdx + adcxq %r8, %rbx + mulxq 16(%rcx), %r8, %rdx + adcxq %r9, %rbp + adoxq %r8, %r15 + adoxq %rdx, %rbx + adoxq %r9, %rbp + movq $38, %rdx + mulxq %rbp, %rbp, %r8 + addq %rbp, %r13 + adcq $0x00, %r8 + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %r8 + imulq $19, %r8, %r8 + andq %r9, %r13 + xorq %r9, %r9 + adoxq %r8, %r10 + mulxq %r14, %r8, %r14 + adcxq %r8, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rax, %r11 - 
addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + mulxq %r15, %r8, %r15 + adcxq %r8, %r11 + adoxq %r15, %r12 + mulxq %rbx, %r8, %rbx + adcxq %r8, %r12 + adoxq %rbx, %r13 + adcxq %r9, %r13 # Store - movq %r8, (%rbx) - movq %r9, 8(%rbx) - movq %r10, 16(%rbx) - movq %r11, 24(%rbx) - movq 136(%rsp), %rdi + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + movq %rsi, %rcx + leaq 32(%rsi), %rax + leaq -64(%rdi), %rsi + leaq -96(%rdi), %rdi + # Add-Sub + # Add + movq (%rax), %r10 + movq 8(%rax), %r11 + movq 16(%rax), %r12 + movq 24(%rax), %r13 + movq %r10, %r14 + addq (%rcx), %r10 + movq %r11, %r15 + adcq 8(%rcx), %r11 + movq %r12, %rbx + adcq 16(%rcx), %r12 + movq %r13, %rbp + adcq 24(%rcx), %r13 + movq $0x00, %r8 + adcq $0x00, %r8 + shldq $0x01, %r13, %r8 + movq $0x7fffffffffffffff, %r9 + imulq $19, %r8 + andq %r9, %r13 + # Sub modulus (if overflow) + addq %r8, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 + # Sub + subq (%rcx), %r14 + sbbq 8(%rcx), %r15 + sbbq 16(%rcx), %rbx + sbbq 24(%rcx), %rbp + sbbq %r8, %r8 + shldq $0x01, %rbp, %r8 + imulq $-19, %r8 + andq %r9, %rbp + # Add modulus (if underflow) + subq %r8, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + movq %r14, (%rsi) + movq %r15, 8(%rsi) + movq %rbx, 16(%rsi) + movq %rbp, 24(%rsi) + movq 16(%rsp), %rax # Multiply # A[0] * B[0] - movq (%rdi), %rdx - mulxq (%rsi), %r8, %r9 + movq (%rax), %rdx + mulxq (%rdi), %r10, %r11 # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 + mulxq 16(%rdi), %r12, %r13 # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rdi), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 + mulxq 8(%rdi), %r8, %r9 + xorq %rbp, %rbp + adcxq %r8, %r11 + # A[3] * B[1] + movq 8(%rax), %rdx + mulxq 24(%rdi), %r14, %r15 + adcxq %r9, %r12 # A[0] * B[1] - movq 8(%rdi), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 + mulxq (%rdi), %r8, %r9 + adoxq %r8, %r11 # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 + mulxq 16(%rdi), %r8, %rbx + adoxq %r9, %r12 + adcxq %r8, %r13 # A[1] * B[2] - movq 16(%rdi), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 + movq 16(%rax), %rdx + mulxq 8(%rdi), %r8, %r9 + adcxq %rbx, %r14 + adoxq %r8, %r13 + adcxq %rbp, %r15 + adoxq %r9, %r14 # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 + mulxq (%rdi), %r8, %r9 + adoxq %rbp, %r15 + xorq %rbx, %rbx + adcxq %r8, %r12 # A[1] * B[1] - movq 8(%rdi), %rdx - mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rdi), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rdi), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 + movq 8(%rax), %rdx + mulxq 8(%rdi), %rdx, %r8 + adcxq %r9, %r13 adoxq %rdx, %r12 + # A[1] * B[3] + movq 24(%rax), %rdx + adoxq %r8, %r13 + mulxq 8(%rdi), %r8, %r9 + adcxq %r8, %r14 + # A[2] * B[2] + movq 16(%rax), %rdx + mulxq 16(%rdi), %rdx, %r8 + adcxq %r9, %r15 + adoxq %rdx, %r14 # A[3] * B[3] - movq 24(%rdi), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 + movq 24(%rax), %rdx + adoxq %r8, %r15 + mulxq 24(%rdi), %r8, %r9 + adoxq %rbp, %rbx + adcxq %r8, %rbx # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, 
%r11 + mulxq (%rdi), %rdx, %r8 + adcxq %r9, %rbp + xorq %r9, %r9 + adcxq %rdx, %r13 # A[3] * B[0] - movq (%rdi), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] movq 24(%rdi), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rdx, %r13 + adcxq %r8, %r14 + mulxq (%rax), %rdx, %r8 + adoxq %rdx, %r13 + adoxq %r8, %r14 # A[3] * B[2] - movq 16(%rdi), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 + movq 24(%rdi), %rdx + mulxq 16(%rax), %rdx, %r8 + adcxq %rdx, %r15 + # A[2] * B[3] + movq 24(%rax), %rdx + adcxq %r8, %rbx + mulxq 16(%rdi), %r8, %rdx + adcxq %r9, %rbp + adoxq %r8, %r15 + adoxq %rdx, %rbx + adoxq %r9, %rbp + movq $38, %rdx + mulxq %rbp, %rbp, %r8 + addq %rbp, %r13 + adcq $0x00, %r8 + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %r8 + imulq $19, %r8, %r8 + andq %r9, %r13 + xorq %r9, %r9 + adoxq %r8, %r10 + mulxq %r14, %r8, %r14 + adcxq %r8, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rax, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + mulxq %r15, %r8, %r15 + adcxq %r8, %r11 + adoxq %r15, %r12 + mulxq %rbx, %r8, %rbx + adcxq %r8, %r12 + adoxq %rbx, %r13 + adcxq %r9, %r13 # Store - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - movq 24(%rsp), %rdi - movq 120(%rsp), %rsi - movq 112(%rsp), %rbp + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + leaq 32(%rax), %rax # Multiply # A[0] * B[0] - movq (%rbp), %rdx - mulxq (%rsi), %r8, %r9 + movq (%rax), %rdx + mulxq (%rsi), %r10, %r11 # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 + mulxq 16(%rsi), %r12, %r13 # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbp), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 + mulxq 8(%rsi), %r8, %r9 + xorq %rbp, %rbp + adcxq %r8, %r11 + # A[3] * B[1] + movq 8(%rax), %rdx + mulxq 24(%rsi), %r14, %r15 + adcxq %r9, %r12 # A[0] * B[1] - movq 8(%rbp), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 + mulxq (%rsi), %r8, %r9 + adoxq %r8, %r11 # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 + mulxq 16(%rsi), %r8, %rbx + adoxq %r9, %r12 + adcxq %r8, %r13 # A[1] * B[2] - movq 16(%rbp), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 + movq 16(%rax), %rdx + mulxq 8(%rsi), %r8, %r9 + adcxq %rbx, %r14 + adoxq %r8, %r13 + adcxq %rbp, %r15 + adoxq %r9, %r14 # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 + mulxq (%rsi), %r8, %r9 + adoxq %rbp, %r15 + xorq %rbx, %rbx + adcxq %r8, %r12 # A[1] * B[1] - movq 8(%rbp), 
%rdx - mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbp), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbp), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 + movq 8(%rax), %rdx + mulxq 8(%rsi), %rdx, %r8 + adcxq %r9, %r13 adoxq %rdx, %r12 + # A[1] * B[3] + movq 24(%rax), %rdx + adoxq %r8, %r13 + mulxq 8(%rsi), %r8, %r9 + adcxq %r8, %r14 + # A[2] * B[2] + movq 16(%rax), %rdx + mulxq 16(%rsi), %rdx, %r8 + adcxq %r9, %r15 + adoxq %rdx, %r14 # A[3] * B[3] - movq 24(%rbp), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 + movq 24(%rax), %rdx + adoxq %r8, %r15 + mulxq 24(%rsi), %r8, %r9 + adoxq %rbp, %rbx + adcxq %r8, %rbx # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbp), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbp), %rdx - mulxq 16(%rsi), %rdx, %rcx + mulxq (%rsi), %rdx, %r8 + adcxq %r9, %rbp + xorq %r9, %r9 adcxq %rdx, %r13 + # A[3] * B[0] + movq 24(%rsi), %rdx + adcxq %r8, %r14 + mulxq (%rax), %rdx, %r8 + adoxq %rdx, %r13 + adoxq %r8, %r14 # A[3] * B[2] - movq 16(%rbp), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 + movq 24(%rsi), %rdx + mulxq 16(%rax), %rdx, %r8 + adcxq %rdx, %r15 + # A[2] * B[3] + movq 24(%rax), %rdx + adcxq %r8, %rbx + mulxq 16(%rsi), %r8, %rdx + adcxq %r9, %rbp + adoxq %r8, %r15 + adoxq %rdx, %rbx + adoxq %r9, %rbp + movq $38, %rdx + mulxq %rbp, %rbp, %r8 + addq %rbp, %r13 + adcq $0x00, %r8 + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %r8 + imulq $19, %r8, %r8 + andq %r9, %r13 + xorq %r9, %r9 + adoxq %r8, %r10 + mulxq %r14, %r8, %r14 + adcxq %r8, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rax, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + mulxq %r15, %r8, %r15 + adcxq %r8, %r11 + adoxq %r15, %r12 + mulxq %rbx, %r8, %rbx + adcxq %r8, %r12 + adoxq %rbx, %r13 + adcxq %r9, %r13 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rdi - movq (%rsp), %rsi + movq %r10, (%rsi) + movq %r11, 8(%rsi) + movq %r12, 16(%rsi) + movq %r13, 24(%rsi) + # Add-Sub # Add - movq (%rbx), %r8 - movq 8(%rbx), %r9 - movq 16(%rbx), %r10 - movq 24(%rbx), %rdx - movq %r8, %r12 - addq (%rdi), %r8 - movq %r9, %r13 - adcq 8(%rdi), %r9 + movq (%rdi), %r10 + movq 8(%rdi), %r11 + movq 16(%rdi), %r12 + movq 24(%rdi), %r13 movq %r10, %r14 - adcq 16(%rdi), %r10 - movq %rdx, %r15 - adcq 24(%rdi), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, 
%rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + addq (%rsi), %r10 + movq %r11, %r15 + adcq 8(%rsi), %r11 + movq %r12, %rbx + adcq 16(%rsi), %r12 + movq %r13, %rbp + adcq 24(%rsi), %r13 + movq $0x00, %r8 + adcq $0x00, %r8 + shldq $0x01, %r13, %r8 + movq $0x7fffffffffffffff, %r9 + imulq $19, %r8 + andq %r9, %r13 # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 + addq %r8, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 # Sub - subq (%rdi), %r12 - movq $0x00, %rdx - sbbq 8(%rdi), %r13 - movq $-19, %rcx - sbbq 16(%rdi), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rdi), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + subq (%rsi), %r14 + sbbq 8(%rsi), %r15 + sbbq 16(%rsi), %rbx + sbbq 24(%rsi), %rbp + sbbq %r8, %r8 + shldq $0x01, %rbp, %r8 + imulq $-19, %r8 + andq %r9, %rbp # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq %r12, (%rsi) - movq %r13, 8(%rsi) - movq %r14, 16(%rsi) - movq %r15, 24(%rsi) - movq 104(%rsp), %rdi + subq %r8, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rsi) + movq %r11, 8(%rsi) + movq %r12, 16(%rsi) + movq %r13, 24(%rsi) + movq %r14, (%rdi) + movq %r15, 8(%rdi) + movq %rbx, 16(%rdi) + movq %rbp, 24(%rdi) + leaq 64(%rcx), %rcx # Double - movq (%rdi), %r8 - movq 8(%rdi), %r9 - addq %r8, %r8 - movq 16(%rdi), %r10 - adcq %r9, %r9 - movq 24(%rdi), %rdx - adcq %r10, %r10 - movq $-19, %rcx - adcq %rdx, %rdx - movq $0x7fffffffffffffff, %rax - movq %rdx, %r11 - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + movq (%rcx), %r10 + movq 8(%rcx), %r11 + addq %r10, %r10 + movq 16(%rcx), %r12 + adcq %r11, %r11 + movq 24(%rcx), %r13 + adcq %r12, %r12 + adcq %r13, %r13 + movq $0x00, %r8 + adcq $0x00, %r8 + shldq $0x01, %r13, %r8 + movq $0x7fffffffffffffff, %r9 + imulq $19, %r8 + andq %r9, %r13 # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - movq %r8, (%rbx) - movq %r9, 8(%rbx) - movq %r10, 16(%rbx) - movq %r11, 24(%rbx) - movq 24(%rsp), %rdi + addq %r8, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 + leaq 96(%rdi), %rsi + leaq 64(%rdi), %rdi + # Add-Sub # Add - movq (%rbx), %r8 - movq 8(%rbx), %r9 - movq 16(%rbx), %r10 - movq 24(%rbx), %rdx - movq %r8, %r12 - addq (%rdi), %r8 - movq %r9, %r13 - adcq 8(%rdi), %r9 movq %r10, %r14 - adcq 16(%rdi), %r10 - movq %rdx, %r15 - adcq 24(%rdi), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + addq (%rsi), %r10 + movq %r11, %r15 + adcq 8(%rsi), %r11 + movq %r12, %rbx + adcq 16(%rsi), %r12 + movq %r13, %rbp + adcq 24(%rsi), %r13 + movq $0x00, %r8 + adcq $0x00, %r8 + shldq $0x01, %r13, %r8 + movq $0x7fffffffffffffff, %r9 + imulq $19, %r8 + andq %r9, %r13 # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 + addq %r8, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 # Sub - subq (%rdi), %r12 - movq $0x00, %rdx - sbbq 8(%rdi), %r13 - movq $-19, %rcx - sbbq 16(%rdi), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rdi), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + subq (%rsi), %r14 + sbbq 8(%rsi), %r15 + sbbq 16(%rsi), %rbx + sbbq 24(%rsi), %rbp + sbbq %r8, %r8 + 
shldq $0x01, %rbp, %r8 + imulq $-19, %r8 + andq %r9, %rbp # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rbx) - movq %r9, 8(%rbx) - movq %r10, 16(%rbx) - movq %r11, 24(%rbx) - movq %r12, (%rdi) - movq %r13, 8(%rdi) - movq %r14, 16(%rdi) - movq %r15, 24(%rdi) - addq $48, %rsp + subq %r8, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + movq %r14, (%rsi) + movq %r15, 8(%rsi) + movq %rbx, 16(%rsi) + movq %rbp, 24(%rsi) + addq $24, %rsp + popq %rbp + popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 - popq %rbx - popq %rbp repz retq #ifndef __APPLE__ -.size fe_ge_madd_avx2,.-fe_ge_madd_avx2 +.size ge_madd_avx2,.-ge_madd_avx2 #endif /* __APPLE__ */ #ifndef __APPLE__ .text -.globl fe_ge_msub_avx2 -.type fe_ge_msub_avx2,@function +.globl ge_msub_avx2 +.type ge_msub_avx2,@function .align 16 -fe_ge_msub_avx2: +ge_msub_avx2: #else .section __TEXT,__text -.globl _fe_ge_msub_avx2 +.globl _ge_msub_avx2 .p2align 4 -_fe_ge_msub_avx2: +_ge_msub_avx2: #endif /* __APPLE__ */ - pushq %rbp - pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 - subq $48, %rsp + pushq %rbx + pushq %rbp + movq %rdx, %rax + subq $24, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rcx, 24(%rsp) - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq 8(%rsp), %rsi - movq 40(%rsp), %rbx - movq 32(%rsp), %rbp - # Add - movq (%rbx), %r8 - movq 8(%rbx), %r9 - movq 16(%rbx), %r10 - movq 24(%rbx), %rdx - movq %r8, %r12 - addq (%rbp), %r8 - movq %r9, %r13 - adcq 8(%rbp), %r9 - movq %r10, %r14 - adcq 16(%rbp), %r10 - movq %rdx, %r15 - adcq 24(%rbp), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - # Sub - subq (%rbp), %r12 - movq $0x00, %rdx - sbbq 8(%rbp), %r13 - movq $-19, %rcx - sbbq 16(%rbp), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rbp), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq %r12, (%rsi) - movq %r13, 8(%rsi) - movq %r14, 16(%rsi) - movq %r15, 24(%rsi) - movq 16(%rsp), %rbx - movq 136(%rsp), %rbp + movq %rax, 16(%rsp) + leaq 96(%rsi), %rcx + leaq 64(%rax), %rax + leaq 96(%rdi), %rdi # Multiply # A[0] * B[0] - movq (%rbp), %rdx - mulxq (%rdi), %r8, %r9 + movq (%rax), %rdx + mulxq (%rcx), %r10, %r11 # A[2] * B[0] - mulxq 16(%rdi), %r10, %r11 + mulxq 16(%rcx), %r12, %r13 # A[1] * B[0] - mulxq 8(%rdi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbp), %rdx - mulxq 8(%rdi), %r12, %r13 - adcxq %rax, %r10 + mulxq 8(%rcx), %r8, %r9 + xorq %rbp, %rbp + adcxq %r8, %r11 + # A[3] * B[1] + movq 8(%rax), %rdx + mulxq 24(%rcx), %r14, %r15 + adcxq %r9, %r12 # A[0] * B[1] - movq 8(%rbp), %rdx - mulxq (%rdi), %rcx, %rax - adoxq %rcx, %r9 + mulxq (%rcx), %r8, %r9 + adoxq %r8, %r11 # A[2] * B[1] - mulxq 16(%rdi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 + mulxq 16(%rcx), %r8, %rbx + adoxq %r9, %r12 + adcxq %r8, %r13 # A[1] * B[2] - movq 16(%rbp), %rdx - mulxq 8(%rdi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 + movq 16(%rax), %rdx + mulxq 8(%rcx), 
%r8, %r9 + adcxq %rbx, %r14 + adoxq %r8, %r13 + adcxq %rbp, %r15 + adoxq %r9, %r14 # A[0] * B[2] - mulxq (%rdi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 + mulxq (%rcx), %r8, %r9 + adoxq %rbp, %r15 + xorq %rbx, %rbx + adcxq %r8, %r12 # A[1] * B[1] - movq 8(%rbp), %rdx - mulxq 8(%rdi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbp), %rdx - adoxq %rcx, %r11 - mulxq 24(%rdi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbp), %rdx - mulxq 16(%rdi), %rdx, %rcx - adcxq %rax, %r13 + movq 8(%rax), %rdx + mulxq 8(%rcx), %rdx, %r8 + adcxq %r9, %r13 adoxq %rdx, %r12 + # A[1] * B[3] + movq 24(%rax), %rdx + adoxq %r8, %r13 + mulxq 8(%rcx), %r8, %r9 + adcxq %r8, %r14 + # A[2] * B[2] + movq 16(%rax), %rdx + mulxq 16(%rcx), %rdx, %r8 + adcxq %r9, %r15 + adoxq %rdx, %r14 # A[3] * B[3] - movq 24(%rbp), %rdx - adoxq %rcx, %r13 - mulxq 24(%rdi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 + movq 24(%rax), %rdx + adoxq %r8, %r15 + mulxq 24(%rcx), %r8, %r9 + adoxq %rbp, %rbx + adcxq %r8, %rbx # A[0] * B[3] - mulxq (%rdi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbp), %rdx - adcxq %rcx, %r12 - mulxq 24(%rdi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbp), %rdx - mulxq 16(%rdi), %rdx, %rcx + mulxq (%rcx), %rdx, %r8 + adcxq %r9, %rbp + xorq %r9, %r9 adcxq %rdx, %r13 + # A[3] * B[0] + movq 24(%rcx), %rdx + adcxq %r8, %r14 + mulxq (%rax), %rdx, %r8 + adoxq %rdx, %r13 + adoxq %r8, %r14 # A[3] * B[2] - movq 16(%rbp), %rdx - adcxq %rcx, %r14 - mulxq 24(%rdi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 + movq 24(%rcx), %rdx + mulxq 16(%rax), %rdx, %r8 + adcxq %rdx, %r15 + # A[2] * B[3] + movq 24(%rax), %rdx + adcxq %r8, %rbx + mulxq 16(%rcx), %r8, %rdx + adcxq %r9, %rbp + adoxq %r8, %r15 + adoxq %rdx, %rbx + adoxq %r9, %rbp + movq $38, %rdx + mulxq %rbp, %rbp, %r8 + addq %rbp, %r13 + adcq $0x00, %r8 + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %r8 + imulq $19, %r8, %r8 + andq %r9, %r13 + xorq %r9, %r9 + adoxq %r8, %r10 + mulxq %r14, %r8, %r14 + adcxq %r8, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rax, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + mulxq %r15, %r8, %r15 + adcxq %r8, %r11 + adoxq %r15, %r12 + mulxq %rbx, %r8, %rbx + adcxq %r8, %r12 + adoxq %rbx, %r13 + adcxq %r9, %r13 # Store - movq %r8, (%rbx) - movq %r9, 8(%rbx) - movq %r10, 16(%rbx) - movq %r11, 24(%rbx) - movq 128(%rsp), %rdi + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + movq %rsi, %rcx + leaq 32(%rsi), %rax + leaq -64(%rdi), %rsi + leaq -96(%rdi), %rdi + # Add-Sub + # Add + movq (%rax), %r10 + 
movq 8(%rax), %r11 + movq 16(%rax), %r12 + movq 24(%rax), %r13 + movq %r10, %r14 + addq (%rcx), %r10 + movq %r11, %r15 + adcq 8(%rcx), %r11 + movq %r12, %rbx + adcq 16(%rcx), %r12 + movq %r13, %rbp + adcq 24(%rcx), %r13 + movq $0x00, %r8 + adcq $0x00, %r8 + shldq $0x01, %r13, %r8 + movq $0x7fffffffffffffff, %r9 + imulq $19, %r8 + andq %r9, %r13 + # Sub modulus (if overflow) + addq %r8, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 + # Sub + subq (%rcx), %r14 + sbbq 8(%rcx), %r15 + sbbq 16(%rcx), %rbx + sbbq 24(%rcx), %rbp + sbbq %r8, %r8 + shldq $0x01, %rbp, %r8 + imulq $-19, %r8 + andq %r9, %rbp + # Add modulus (if underflow) + subq %r8, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + movq %r14, (%rsi) + movq %r15, 8(%rsi) + movq %rbx, 16(%rsi) + movq %rbp, 24(%rsi) + movq 16(%rsp), %rax + leaq 32(%rax), %rax # Multiply # A[0] * B[0] - movq (%rdi), %rdx - mulxq (%rsi), %r8, %r9 + movq (%rax), %rdx + mulxq (%rdi), %r10, %r11 # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 + mulxq 16(%rdi), %r12, %r13 # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rdi), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 + mulxq 8(%rdi), %r8, %r9 + xorq %rbp, %rbp + adcxq %r8, %r11 + # A[3] * B[1] + movq 8(%rax), %rdx + mulxq 24(%rdi), %r14, %r15 + adcxq %r9, %r12 # A[0] * B[1] - movq 8(%rdi), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 + mulxq (%rdi), %r8, %r9 + adoxq %r8, %r11 # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 + mulxq 16(%rdi), %r8, %rbx + adoxq %r9, %r12 + adcxq %r8, %r13 # A[1] * B[2] - movq 16(%rdi), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 + movq 16(%rax), %rdx + mulxq 8(%rdi), %r8, %r9 + adcxq %rbx, %r14 + adoxq %r8, %r13 + adcxq %rbp, %r15 + adoxq %r9, %r14 # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 + mulxq (%rdi), %r8, %r9 + adoxq %rbp, %r15 + xorq %rbx, %rbx + adcxq %r8, %r12 # A[1] * B[1] - movq 8(%rdi), %rdx - mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rdi), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rdi), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 + movq 8(%rax), %rdx + mulxq 8(%rdi), %rdx, %r8 + adcxq %r9, %r13 adoxq %rdx, %r12 + # A[1] * B[3] + movq 24(%rax), %rdx + adoxq %r8, %r13 + mulxq 8(%rdi), %r8, %r9 + adcxq %r8, %r14 + # A[2] * B[2] + movq 16(%rax), %rdx + mulxq 16(%rdi), %rdx, %r8 + adcxq %r9, %r15 + adoxq %rdx, %r14 # A[3] * B[3] - movq 24(%rdi), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 + movq 24(%rax), %rdx + adoxq %r8, %r15 + mulxq 24(%rdi), %r8, %r9 + adoxq %rbp, %rbx + adcxq %r8, %rbx # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 + mulxq (%rdi), %rdx, %r8 + adcxq %r9, %rbp + xorq %r9, %r9 + adcxq %rdx, %r13 # A[3] * B[0] - movq (%rdi), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] movq 24(%rdi), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rdx, %r13 + adcxq %r8, %r14 + mulxq (%rax), %rdx, %r8 + adoxq %rdx, %r13 + adoxq %r8, %r14 # A[3] * B[2] - movq 16(%rdi), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - 
adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 + movq 24(%rdi), %rdx + mulxq 16(%rax), %rdx, %r8 + adcxq %rdx, %r15 + # A[2] * B[3] + movq 24(%rax), %rdx + adcxq %r8, %rbx + mulxq 16(%rdi), %r8, %rdx + adcxq %r9, %rbp + adoxq %r8, %r15 + adoxq %rdx, %rbx + adoxq %r9, %rbp + movq $38, %rdx + mulxq %rbp, %rbp, %r8 + addq %rbp, %r13 + adcq $0x00, %r8 + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %r8 + imulq $19, %r8, %r8 + andq %r9, %r13 + xorq %r9, %r9 + adoxq %r8, %r10 + mulxq %r14, %r8, %r14 + adcxq %r8, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rax, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + mulxq %r15, %r8, %r15 + adcxq %r8, %r11 + adoxq %r15, %r12 + mulxq %rbx, %r8, %rbx + adcxq %r8, %r12 + adoxq %rbx, %r13 + adcxq %r9, %r13 # Store - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - movq 24(%rsp), %rdi - movq 120(%rsp), %rsi - movq 112(%rsp), %rbp + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + leaq -32(%rax), %rax # Multiply # A[0] * B[0] - movq (%rbp), %rdx - mulxq (%rsi), %r8, %r9 + movq (%rax), %rdx + mulxq (%rsi), %r10, %r11 # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 + mulxq 16(%rsi), %r12, %r13 # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbp), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 + mulxq 8(%rsi), %r8, %r9 + xorq %rbp, %rbp + adcxq %r8, %r11 + # A[3] * B[1] + movq 8(%rax), %rdx + mulxq 24(%rsi), %r14, %r15 + adcxq %r9, %r12 # A[0] * B[1] - movq 8(%rbp), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 + mulxq (%rsi), %r8, %r9 + adoxq %r8, %r11 # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 + mulxq 16(%rsi), %r8, %rbx + adoxq %r9, %r12 + adcxq %r8, %r13 # A[1] * B[2] - movq 16(%rbp), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 + movq 16(%rax), %rdx + mulxq 8(%rsi), %r8, %r9 + adcxq %rbx, %r14 + adoxq %r8, %r13 + adcxq %rbp, %r15 + adoxq %r9, %r14 # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 + mulxq (%rsi), %r8, %r9 + adoxq %rbp, %r15 + xorq %rbx, %rbx + adcxq %r8, %r12 # A[1] * B[1] - movq 8(%rbp), %rdx - mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbp), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbp), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 + movq 8(%rax), %rdx + mulxq 8(%rsi), %rdx, %r8 + adcxq %r9, %r13 adoxq %rdx, %r12 + # A[1] * B[3] + movq 24(%rax), %rdx + adoxq %r8, %r13 + mulxq 8(%rsi), %r8, %r9 + adcxq %r8, %r14 + # A[2] * B[2] + movq 16(%rax), %rdx + mulxq 16(%rsi), %rdx, %r8 + 
adcxq %r9, %r15 + adoxq %rdx, %r14 # A[3] * B[3] - movq 24(%rbp), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 + movq 24(%rax), %rdx + adoxq %r8, %r15 + mulxq 24(%rsi), %r8, %r9 + adoxq %rbp, %rbx + adcxq %r8, %rbx # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbp), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbp), %rdx - mulxq 16(%rsi), %rdx, %rcx + mulxq (%rsi), %rdx, %r8 + adcxq %r9, %rbp + xorq %r9, %r9 adcxq %rdx, %r13 + # A[3] * B[0] + movq 24(%rsi), %rdx + adcxq %r8, %r14 + mulxq (%rax), %rdx, %r8 + adoxq %rdx, %r13 + adoxq %r8, %r14 # A[3] * B[2] - movq 16(%rbp), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rax, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + movq 24(%rsi), %rdx + mulxq 16(%rax), %rdx, %r8 + adcxq %rdx, %r15 + # A[2] * B[3] + movq 24(%rax), %rdx + adcxq %r8, %rbx + mulxq 16(%rsi), %r8, %rdx + adcxq %r9, %rbp + adoxq %r8, %r15 + adoxq %rdx, %rbx + adoxq %r9, %rbp + movq $38, %rdx + mulxq %rbp, %rbp, %r8 + addq %rbp, %r13 + adcq $0x00, %r8 + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %r8 + imulq $19, %r8, %r8 + andq %r9, %r13 + xorq %r9, %r9 + adoxq %r8, %r10 + mulxq %r14, %r8, %r14 + adcxq %r8, %r10 + adoxq %r14, %r11 + mulxq %r15, %r8, %r15 + adcxq %r8, %r11 + adoxq %r15, %r12 + mulxq %rbx, %r8, %rbx + adcxq %r8, %r12 + adoxq %rbx, %r13 + adcxq %r9, %r13 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq 8(%rsp), %rsi - movq (%rsp), %rbp + movq %r10, (%rsi) + movq %r11, 8(%rsi) + movq %r12, 16(%rsi) + movq %r13, 24(%rsi) + # Add-Sub # Add - movq (%rbx), %r8 - movq 8(%rbx), %r9 - movq 16(%rbx), %r10 - movq 24(%rbx), %rdx - movq %r8, %r12 - addq (%rsi), %r8 - movq %r9, %r13 - adcq 8(%rsi), %r9 + movq (%rdi), %r10 + movq 8(%rdi), %r11 + movq 16(%rdi), %r12 + movq 24(%rdi), %r13 movq %r10, %r14 - adcq 16(%rsi), %r10 - movq %rdx, %r15 - adcq 24(%rsi), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + addq (%rsi), %r10 + movq %r11, %r15 + adcq 8(%rsi), %r11 + movq %r12, %rbx + adcq 16(%rsi), %r12 + movq %r13, %rbp + adcq 24(%rsi), %r13 + movq $0x00, %r8 + adcq $0x00, %r8 + shldq $0x01, %r13, %r8 + movq $0x7fffffffffffffff, %r9 + imulq $19, %r8 + andq %r9, %r13 # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 + addq %r8, %r10 + adcq $0x00, %r11 + adcq 
$0x00, %r12 + adcq $0x00, %r13 # Sub - subq (%rsi), %r12 - movq $0x00, %rdx - sbbq 8(%rsi), %r13 - movq $-19, %rcx - sbbq 16(%rsi), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rsi), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + subq (%rsi), %r14 + sbbq 8(%rsi), %r15 + sbbq 16(%rsi), %rbx + sbbq 24(%rsi), %rbp + sbbq %r8, %r8 + shldq $0x01, %rbp, %r8 + imulq $-19, %r8 + andq %r9, %rbp # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - movq %r12, (%rbp) - movq %r13, 8(%rbp) - movq %r14, 16(%rbp) - movq %r15, 24(%rbp) - movq 104(%rsp), %rsi + subq %r8, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rsi) + movq %r11, 8(%rsi) + movq %r12, 16(%rsi) + movq %r13, 24(%rsi) + movq %r14, (%rdi) + movq %r15, 8(%rdi) + movq %rbx, 16(%rdi) + movq %rbp, 24(%rdi) + leaq 64(%rcx), %rcx # Double - movq (%rsi), %r8 - movq 8(%rsi), %r9 - addq %r8, %r8 - movq 16(%rsi), %r10 - adcq %r9, %r9 - movq 24(%rsi), %rdx - adcq %r10, %r10 - movq $-19, %rcx - adcq %rdx, %rdx - movq $0x7fffffffffffffff, %rax - movq %rdx, %r11 - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + movq (%rcx), %r10 + movq 8(%rcx), %r11 + addq %r10, %r10 + movq 16(%rcx), %r12 + adcq %r11, %r11 + movq 24(%rcx), %r13 + adcq %r12, %r12 + adcq %r13, %r13 + movq $0x00, %r8 + adcq $0x00, %r8 + shldq $0x01, %r13, %r8 + movq $0x7fffffffffffffff, %r9 + imulq $19, %r8 + andq %r9, %r13 # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - movq %r8, (%rbx) - movq %r9, 8(%rbx) - movq %r10, 16(%rbx) - movq %r11, 24(%rbx) + addq %r8, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 + leaq 96(%rdi), %rsi + leaq 64(%rdi), %rdi + # Add-Sub # Add - movq (%rbx), %r8 - movq 8(%rbx), %r9 - movq 16(%rbx), %r10 - movq 24(%rbx), %rdx - movq %r8, %r12 - addq (%rdi), %r8 - movq %r9, %r13 - adcq 8(%rdi), %r9 movq %r10, %r14 - adcq 16(%rdi), %r10 - movq %rdx, %r15 - adcq 24(%rdi), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + addq (%rsi), %r10 + movq %r11, %r15 + adcq 8(%rsi), %r11 + movq %r12, %rbx + adcq 16(%rsi), %r12 + movq %r13, %rbp + adcq 24(%rsi), %r13 + movq $0x00, %r8 + adcq $0x00, %r8 + shldq $0x01, %r13, %r8 + movq $0x7fffffffffffffff, %r9 + imulq $19, %r8 + andq %r9, %r13 # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 + addq %r8, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 # Sub - subq (%rdi), %r12 - movq $0x00, %rdx - sbbq 8(%rdi), %r13 - movq $-19, %rcx - sbbq 16(%rdi), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rdi), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + subq (%rsi), %r14 + sbbq 8(%rsi), %r15 + sbbq 16(%rsi), %rbx + sbbq 24(%rsi), %rbp + sbbq %r8, %r8 + shldq $0x01, %rbp, %r8 + imulq $-19, %r8 + andq %r9, %rbp # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq %r12, (%rbx) - movq %r13, 8(%rbx) - movq %r14, 16(%rbx) - movq %r15, 24(%rbx) - addq $48, %rsp + subq %r8, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rsi) + movq %r11, 8(%rsi) + movq %r12, 16(%rsi) + movq %r13, 24(%rsi) + 
movq %r14, (%rdi) + movq %r15, 8(%rdi) + movq %rbx, 16(%rdi) + movq %rbp, 24(%rdi) + addq $24, %rsp + popq %rbp + popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 - popq %rbx - popq %rbp repz retq #ifndef __APPLE__ -.size fe_ge_msub_avx2,.-fe_ge_msub_avx2 +.size ge_msub_avx2,.-ge_msub_avx2 #endif /* __APPLE__ */ #ifndef __APPLE__ .text -.globl fe_ge_add_avx2 -.type fe_ge_add_avx2,@function +.globl ge_add_avx2 +.type ge_add_avx2,@function .align 16 -fe_ge_add_avx2: +ge_add_avx2: #else .section __TEXT,__text -.globl _fe_ge_add_avx2 +.globl _ge_add_avx2 .p2align 4 -_fe_ge_add_avx2: +_ge_add_avx2: #endif /* __APPLE__ */ - pushq %rbx - pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - subq $0x50, %rsp + pushq %rbx + pushq %rbp + movq %rdx, %rax + subq $24, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rcx, 24(%rsp) - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq 8(%rsp), %rsi - movq 40(%rsp), %rbx - movq 32(%rsp), %rbp - # Add - movq (%rbx), %r8 - movq 8(%rbx), %r9 - movq 16(%rbx), %r10 - movq 24(%rbx), %rdx - movq %r8, %r12 - addq (%rbp), %r8 - movq %r9, %r13 - adcq 8(%rbp), %r9 - movq %r10, %r14 - adcq 16(%rbp), %r10 - movq %rdx, %r15 - adcq 24(%rbp), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - # Sub - subq (%rbp), %r12 - movq $0x00, %rdx - sbbq 8(%rbp), %r13 - movq $-19, %rcx - sbbq 16(%rbp), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rbp), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax - # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq %r12, (%rsi) - movq %r13, 8(%rsi) - movq %r14, 16(%rsi) - movq %r15, 24(%rsi) - movq 16(%rsp), %rbx - movq 168(%rsp), %rbp + movq %rax, 16(%rsp) + leaq 96(%rsi), %rcx + leaq 96(%rax), %rax + leaq 96(%rdi), %rdi # Multiply # A[0] * B[0] - movq (%rbp), %rdx - mulxq (%rdi), %r8, %r9 + movq (%rax), %rdx + mulxq (%rcx), %r10, %r11 # A[2] * B[0] - mulxq 16(%rdi), %r10, %r11 + mulxq 16(%rcx), %r12, %r13 # A[1] * B[0] - mulxq 8(%rdi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbp), %rdx - mulxq 8(%rdi), %r12, %r13 - adcxq %rax, %r10 + mulxq 8(%rcx), %r8, %r9 + xorq %rbp, %rbp + adcxq %r8, %r11 + # A[3] * B[1] + movq 8(%rax), %rdx + mulxq 24(%rcx), %r14, %r15 + adcxq %r9, %r12 # A[0] * B[1] - movq 8(%rbp), %rdx - mulxq (%rdi), %rcx, %rax - adoxq %rcx, %r9 + mulxq (%rcx), %r8, %r9 + adoxq %r8, %r11 # A[2] * B[1] - mulxq 16(%rdi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 + mulxq 16(%rcx), %r8, %rbx + adoxq %r9, %r12 + adcxq %r8, %r13 # A[1] * B[2] - movq 16(%rbp), %rdx - mulxq 8(%rdi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 + movq 16(%rax), %rdx + mulxq 8(%rcx), %r8, %r9 + adcxq %rbx, %r14 + adoxq %r8, %r13 + adcxq %rbp, %r15 + adoxq %r9, %r14 # A[0] * B[2] - mulxq (%rdi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 + mulxq (%rcx), %r8, %r9 + adoxq %rbp, %r15 + xorq %rbx, %rbx + adcxq %r8, %r12 # A[1] * B[1] - movq 8(%rbp), %rdx - mulxq 8(%rdi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbp), %rdx - adoxq %rcx, %r11 - mulxq 24(%rdi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbp), %rdx - mulxq 
16(%rdi), %rdx, %rcx - adcxq %rax, %r13 + movq 8(%rax), %rdx + mulxq 8(%rcx), %rdx, %r8 + adcxq %r9, %r13 adoxq %rdx, %r12 + # A[1] * B[3] + movq 24(%rax), %rdx + adoxq %r8, %r13 + mulxq 8(%rcx), %r8, %r9 + adcxq %r8, %r14 + # A[2] * B[2] + movq 16(%rax), %rdx + mulxq 16(%rcx), %rdx, %r8 + adcxq %r9, %r15 + adoxq %rdx, %r14 # A[3] * B[3] - movq 24(%rbp), %rdx - adoxq %rcx, %r13 - mulxq 24(%rdi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 + movq 24(%rax), %rdx + adoxq %r8, %r15 + mulxq 24(%rcx), %r8, %r9 + adoxq %rbp, %rbx + adcxq %r8, %rbx # A[0] * B[3] - mulxq (%rdi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbp), %rdx - adcxq %rcx, %r12 - mulxq 24(%rdi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbp), %rdx - mulxq 16(%rdi), %rdx, %rcx + mulxq (%rcx), %rdx, %r8 + adcxq %r9, %rbp + xorq %r9, %r9 adcxq %rdx, %r13 + # A[3] * B[0] + movq 24(%rcx), %rdx + adcxq %r8, %r14 + mulxq (%rax), %rdx, %r8 + adoxq %rdx, %r13 + adoxq %r8, %r14 # A[3] * B[2] - movq 16(%rbp), %rdx - adcxq %rcx, %r14 - mulxq 24(%rdi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 + movq 24(%rcx), %rdx + mulxq 16(%rax), %rdx, %r8 + adcxq %rdx, %r15 + # A[2] * B[3] + movq 24(%rax), %rdx + adcxq %r8, %rbx + mulxq 16(%rcx), %r8, %rdx + adcxq %r9, %rbp + adoxq %r8, %r15 + adoxq %rdx, %rbx + adoxq %r9, %rbp + movq $38, %rdx + mulxq %rbp, %rbp, %r8 + addq %rbp, %r13 + adcq $0x00, %r8 + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %r8 + imulq $19, %r8, %r8 + andq %r9, %r13 + xorq %r9, %r9 + adoxq %r8, %r10 + mulxq %r14, %r8, %r14 + adcxq %r8, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rax, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + mulxq %r15, %r8, %r15 + adcxq %r8, %r11 + adoxq %r15, %r12 + mulxq %rbx, %r8, %rbx + adcxq %r8, %r12 + adoxq %rbx, %r13 + adcxq %r9, %r13 # Store - movq %r8, (%rbx) - movq %r9, 8(%rbx) - movq %r10, 16(%rbx) - movq %r11, 24(%rbx) - movq 176(%rsp), %rbx + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + movq %rsi, %rcx + leaq 32(%rsi), %rax + leaq -64(%rdi), %rsi + leaq -96(%rdi), %rdi + # Add-Sub + # Add + movq (%rax), %r10 + movq 8(%rax), %r11 + movq 16(%rax), %r12 + movq 24(%rax), %r13 + movq %r10, %r14 + addq (%rcx), %r10 + movq %r11, %r15 + adcq 8(%rcx), %r11 + movq %r12, %rbx + adcq 16(%rcx), %r12 + movq %r13, %rbp + adcq 24(%rcx), %r13 + movq $0x00, %r8 + adcq $0x00, %r8 + shldq $0x01, %r13, %r8 + movq $0x7fffffffffffffff, %r9 + imulq $19, %r8 + andq %r9, %r13 + # Sub modulus (if overflow) + addq %r8, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 + # Sub + subq (%rcx), %r14 + sbbq 8(%rcx), %r15 + sbbq 
16(%rcx), %rbx + sbbq 24(%rcx), %rbp + sbbq %r8, %r8 + shldq $0x01, %rbp, %r8 + imulq $-19, %r8 + andq %r9, %rbp + # Add modulus (if underflow) + subq %r8, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + movq %r14, (%rsi) + movq %r15, 8(%rsi) + movq %rbx, 16(%rsi) + movq %rbp, 24(%rsi) + movq 16(%rsp), %rax # Multiply # A[0] * B[0] - movq (%rbx), %rdx - mulxq (%rsi), %r8, %r9 + movq (%rax), %rdx + mulxq (%rdi), %r10, %r11 # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 + mulxq 16(%rdi), %r12, %r13 # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbx), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 + mulxq 8(%rdi), %r8, %r9 + xorq %rbp, %rbp + adcxq %r8, %r11 + # A[3] * B[1] + movq 8(%rax), %rdx + mulxq 24(%rdi), %r14, %r15 + adcxq %r9, %r12 # A[0] * B[1] - movq 8(%rbx), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 + mulxq (%rdi), %r8, %r9 + adoxq %r8, %r11 # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 + mulxq 16(%rdi), %r8, %rbx + adoxq %r9, %r12 + adcxq %r8, %r13 # A[1] * B[2] - movq 16(%rbx), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 + movq 16(%rax), %rdx + mulxq 8(%rdi), %r8, %r9 + adcxq %rbx, %r14 + adoxq %r8, %r13 + adcxq %rbp, %r15 + adoxq %r9, %r14 # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 + mulxq (%rdi), %r8, %r9 + adoxq %rbp, %r15 + xorq %rbx, %rbx + adcxq %r8, %r12 # A[1] * B[1] - movq 8(%rbx), %rdx - mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbx), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 + movq 8(%rax), %rdx + mulxq 8(%rdi), %rdx, %r8 + adcxq %r9, %r13 adoxq %rdx, %r12 + # A[1] * B[3] + movq 24(%rax), %rdx + adoxq %r8, %r13 + mulxq 8(%rdi), %r8, %r9 + adcxq %r8, %r14 + # A[2] * B[2] + movq 16(%rax), %rdx + mulxq 16(%rdi), %rdx, %r8 + adcxq %r9, %r15 + adoxq %rdx, %r14 # A[3] * B[3] - movq 24(%rbx), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 + movq 24(%rax), %rdx + adoxq %r8, %r15 + mulxq 24(%rdi), %r8, %r9 + adoxq %rbp, %rbx + adcxq %r8, %rbx # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbx), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx + mulxq (%rdi), %rdx, %r8 + adcxq %r9, %rbp + xorq %r9, %r9 adcxq %rdx, %r13 + # A[3] * B[0] + movq 24(%rdi), %rdx + adcxq %r8, %r14 + mulxq (%rax), %rdx, %r8 + adoxq %rdx, %r13 + adoxq %r8, %r14 # A[3] * B[2] - movq 16(%rbx), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 + movq 24(%rdi), %rdx + mulxq 16(%rax), 
%rdx, %r8 + adcxq %rdx, %r15 + # A[2] * B[3] + movq 24(%rax), %rdx + adcxq %r8, %rbx + mulxq 16(%rdi), %r8, %rdx + adcxq %r9, %rbp + adoxq %r8, %r15 + adoxq %rdx, %rbx + adoxq %r9, %rbp + movq $38, %rdx + mulxq %rbp, %rbp, %r8 + addq %rbp, %r13 + adcq $0x00, %r8 + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %r8 + imulq $19, %r8, %r8 + andq %r9, %r13 + xorq %r9, %r9 + adoxq %r8, %r10 + mulxq %r14, %r8, %r14 + adcxq %r8, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rax, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + mulxq %r15, %r8, %r15 + adcxq %r8, %r11 + adoxq %r15, %r12 + mulxq %rbx, %r8, %rbx + adcxq %r8, %r12 + adoxq %rbx, %r13 + adcxq %r9, %r13 # Store - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - movq 24(%rsp), %rsi - movq 160(%rsp), %rbx - movq 144(%rsp), %rbp + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + leaq 32(%rax), %rax # Multiply # A[0] * B[0] - movq (%rbp), %rdx - mulxq (%rbx), %r8, %r9 + movq (%rax), %rdx + mulxq (%rsi), %r10, %r11 # A[2] * B[0] - mulxq 16(%rbx), %r10, %r11 + mulxq 16(%rsi), %r12, %r13 # A[1] * B[0] - mulxq 8(%rbx), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbp), %rdx - mulxq 8(%rbx), %r12, %r13 - adcxq %rax, %r10 + mulxq 8(%rsi), %r8, %r9 + xorq %rbp, %rbp + adcxq %r8, %r11 + # A[3] * B[1] + movq 8(%rax), %rdx + mulxq 24(%rsi), %r14, %r15 + adcxq %r9, %r12 # A[0] * B[1] - movq 8(%rbp), %rdx - mulxq (%rbx), %rcx, %rax - adoxq %rcx, %r9 + mulxq (%rsi), %r8, %r9 + adoxq %r8, %r11 # A[2] * B[1] - mulxq 16(%rbx), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 + mulxq 16(%rsi), %r8, %rbx + adoxq %r9, %r12 + adcxq %r8, %r13 # A[1] * B[2] - movq 16(%rbp), %rdx - mulxq 8(%rbx), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 + movq 16(%rax), %rdx + mulxq 8(%rsi), %r8, %r9 + adcxq %rbx, %r14 + adoxq %r8, %r13 + adcxq %rbp, %r15 + adoxq %r9, %r14 # A[0] * B[2] - mulxq (%rbx), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 + mulxq (%rsi), %r8, %r9 + adoxq %rbp, %r15 + xorq %rbx, %rbx + adcxq %r8, %r12 # A[1] * B[1] - movq 8(%rbp), %rdx - mulxq 8(%rbx), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbp), %rdx - adoxq %rcx, %r11 - mulxq 24(%rbx), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbp), %rdx - mulxq 16(%rbx), %rdx, %rcx - adcxq %rax, %r13 + movq 8(%rax), %rdx + mulxq 8(%rsi), %rdx, %r8 + adcxq %r9, %r13 adoxq %rdx, %r12 + # A[1] * B[3] + movq 24(%rax), %rdx + adoxq %r8, %r13 + mulxq 8(%rsi), %r8, %r9 + adcxq %r8, %r14 + # A[2] * B[2] + movq 16(%rax), %rdx + mulxq 16(%rsi), %rdx, %r8 + adcxq %r9, %r15 + adoxq %rdx, %r14 # A[3] * B[3] - movq 24(%rbp), %rdx - adoxq %rcx, %r13 - mulxq 24(%rbx), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 + movq 24(%rax), %rdx + adoxq %r8, %r15 + mulxq 24(%rsi), %r8, %r9 + adoxq %rbp, %rbx + adcxq %r8, %rbx # A[0] * B[3] - mulxq (%rbx), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbp), %rdx - adcxq %rcx, %r12 - mulxq 24(%rbx), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 
24(%rbp), %rdx - mulxq 16(%rbx), %rdx, %rcx + mulxq (%rsi), %rdx, %r8 + adcxq %r9, %rbp + xorq %r9, %r9 adcxq %rdx, %r13 + # A[3] * B[0] + movq 24(%rsi), %rdx + adcxq %r8, %r14 + mulxq (%rax), %rdx, %r8 + adoxq %rdx, %r13 + adoxq %r8, %r14 # A[3] * B[2] - movq 16(%rbp), %rdx - adcxq %rcx, %r14 - mulxq 24(%rbx), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 + movq 24(%rsi), %rdx + mulxq 16(%rax), %rdx, %r8 + adcxq %rdx, %r15 + # A[2] * B[3] + movq 24(%rax), %rdx + adcxq %r8, %rbx + mulxq 16(%rsi), %r8, %rdx + adcxq %r9, %rbp + adoxq %r8, %r15 + adoxq %rdx, %rbx + adoxq %r9, %rbp + movq $38, %rdx + mulxq %rbp, %rbp, %r8 + addq %rbp, %r13 + adcq $0x00, %r8 + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %r8 + imulq $19, %r8, %r8 + andq %r9, %r13 + xorq %r9, %r9 + adoxq %r8, %r10 + mulxq %r14, %r8, %r14 + adcxq %r8, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rax, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + mulxq %r15, %r8, %r15 + adcxq %r8, %r11 + adoxq %r15, %r12 + mulxq %rbx, %r8, %rbx + adcxq %r8, %r12 + adoxq %rbx, %r13 + adcxq %r9, %r13 # Store - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - movq 136(%rsp), %rsi - movq 152(%rsp), %rbx + movq %r10, (%rsi) + movq %r11, 8(%rsi) + movq %r12, 16(%rsi) + movq %r13, 24(%rsi) + leaq 64(%rcx), %rcx + leaq 32(%rax), %rax # Multiply # A[0] * B[0] - movq (%rbx), %rdx - mulxq (%rsi), %r8, %r9 + movq (%rax), %rdx + mulxq (%rcx), %r10, %r11 # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 + mulxq 16(%rcx), %r12, %r13 # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbx), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 + mulxq 8(%rcx), %r8, %r9 + xorq %rbp, %rbp + adcxq %r8, %r11 + # A[3] * B[1] + movq 8(%rax), %rdx + mulxq 24(%rcx), %r14, %r15 + adcxq %r9, %r12 # A[0] * B[1] - movq 8(%rbx), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 + mulxq (%rcx), %r8, %r9 + adoxq %r8, %r11 # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 + mulxq 16(%rcx), %r8, %rbx + adoxq %r9, %r12 + adcxq %r8, %r13 # A[1] * B[2] - movq 16(%rbx), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 + movq 16(%rax), %rdx + mulxq 8(%rcx), %r8, %r9 + adcxq %rbx, %r14 + adoxq %r8, %r13 + adcxq %rbp, %r15 + adoxq %r9, %r14 # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 + mulxq (%rcx), %r8, %r9 + adoxq %rbp, %r15 + xorq %rbx, %rbx + adcxq %r8, %r12 # A[1] * B[1] - movq 8(%rbx), %rdx - mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbx), %rdx - adoxq %rcx, %r11 - 
mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 + movq 8(%rax), %rdx + mulxq 8(%rcx), %rdx, %r8 + adcxq %r9, %r13 adoxq %rdx, %r12 + # A[1] * B[3] + movq 24(%rax), %rdx + adoxq %r8, %r13 + mulxq 8(%rcx), %r8, %r9 + adcxq %r8, %r14 + # A[2] * B[2] + movq 16(%rax), %rdx + mulxq 16(%rcx), %rdx, %r8 + adcxq %r9, %r15 + adoxq %rdx, %r14 # A[3] * B[3] - movq 24(%rbx), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 + movq 24(%rax), %rdx + adoxq %r8, %r15 + mulxq 24(%rcx), %r8, %r9 + adoxq %rbp, %rbx + adcxq %r8, %rbx # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbx), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx + mulxq (%rcx), %rdx, %r8 + adcxq %r9, %rbp + xorq %r9, %r9 adcxq %rdx, %r13 + # A[3] * B[0] + movq 24(%rcx), %rdx + adcxq %r8, %r14 + mulxq (%rax), %rdx, %r8 + adoxq %rdx, %r13 + adoxq %r8, %r14 # A[3] * B[2] - movq 16(%rbx), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 + movq 24(%rcx), %rdx + mulxq 16(%rax), %rdx, %r8 + adcxq %rdx, %r15 + # A[2] * B[3] + movq 24(%rax), %rdx + adcxq %r8, %rbx + mulxq 16(%rcx), %r8, %rdx + adcxq %r9, %rbp + adoxq %r8, %r15 + adoxq %rdx, %rbx + adoxq %r9, %rbp + movq $38, %rdx + mulxq %rbp, %rbp, %r8 + addq %rbp, %r13 + adcq $0x00, %r8 + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %r8 + imulq $19, %r8, %r8 + andq %r9, %r13 + xorq %r9, %r9 + adoxq %r8, %r10 + mulxq %r14, %r8, %r14 + adcxq %r8, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rax, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + mulxq %r15, %r8, %r15 + adcxq %r8, %r11 + adoxq %r15, %r12 + mulxq %rbx, %r8, %rbx + adcxq %r8, %r12 + adoxq %rbx, %r13 + adcxq %r9, %r13 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - leaq 48(%rsp), %rsi + leaq 64(%rdi), %rdi # Double - movq (%rdi), %r8 - movq 8(%rdi), %r9 - addq %r8, %r8 - movq 16(%rdi), %r10 - adcq %r9, %r9 - movq 24(%rdi), %rdx - adcq %r10, %r10 - movq $-19, %rcx - adcq %rdx, %rdx - movq $0x7fffffffffffffff, %rax - movq %rdx, %r11 - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + movq $0x00, %r8 + adcq $0x00, %r8 + shldq $0x01, %r13, %r8 + movq $0x7fffffffffffffff, %r9 + imulq $19, %r8 + andq %r9, %r13 # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - movq %r8, 
(%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - movq 8(%rsp), %rbx - movq 16(%rsp), %rbp + addq %r8, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + leaq -64(%rdi), %rdi + # Add-Sub # Add - movq (%rbp), %r8 - movq 8(%rbp), %r9 - movq 16(%rbp), %r10 - movq 24(%rbp), %rdx - movq %r8, %r12 - addq (%rbx), %r8 - movq %r9, %r13 - adcq 8(%rbx), %r9 + movq (%rdi), %r10 + movq 8(%rdi), %r11 + movq 16(%rdi), %r12 + movq 24(%rdi), %r13 movq %r10, %r14 - adcq 16(%rbx), %r10 - movq %rdx, %r15 - adcq 24(%rbx), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + addq (%rsi), %r10 + movq %r11, %r15 + adcq 8(%rsi), %r11 + movq %r12, %rbx + adcq 16(%rsi), %r12 + movq %r13, %rbp + adcq 24(%rsi), %r13 + movq $0x00, %r8 + adcq $0x00, %r8 + shldq $0x01, %r13, %r8 + movq $0x7fffffffffffffff, %r9 + imulq $19, %r8 + andq %r9, %r13 # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 + addq %r8, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 # Sub - subq (%rbx), %r12 - movq $0x00, %rdx - sbbq 8(%rbx), %r13 - movq $-19, %rcx - sbbq 16(%rbx), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rbx), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + subq (%rsi), %r14 + sbbq 8(%rsi), %r15 + sbbq 16(%rsi), %rbx + sbbq 24(%rsi), %rbp + sbbq %r8, %r8 + shldq $0x01, %rbp, %r8 + imulq $-19, %r8 + andq %r9, %rbp # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rbx) - movq %r9, 8(%rbx) - movq %r10, 16(%rbx) - movq %r11, 24(%rbx) - movq %r12, (%rdi) - movq %r13, 8(%rdi) - movq %r14, 16(%rdi) - movq %r15, 24(%rdi) - movq 24(%rsp), %rdi + subq %r8, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rsi) + movq %r11, 8(%rsi) + movq %r12, 16(%rsi) + movq %r13, 24(%rsi) + movq %r14, (%rdi) + movq %r15, 8(%rdi) + movq %rbx, 16(%rdi) + movq %rbp, 24(%rdi) + leaq 96(%rdi), %rsi + leaq 64(%rdi), %rdi + # Add-Sub # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %rdx - movq %r8, %r12 - addq (%rdi), %r8 - movq %r9, %r13 - adcq 8(%rdi), %r9 + movq (%rdi), %r10 + movq 8(%rdi), %r11 + movq 16(%rdi), %r12 + movq 24(%rdi), %r13 movq %r10, %r14 - adcq 16(%rdi), %r10 - movq %rdx, %r15 - adcq 24(%rdi), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + addq (%rsi), %r10 + movq %r11, %r15 + adcq 8(%rsi), %r11 + movq %r12, %rbx + adcq 16(%rsi), %r12 + movq %r13, %rbp + adcq 24(%rsi), %r13 + movq $0x00, %r8 + adcq $0x00, %r8 + shldq $0x01, %r13, %r8 + movq $0x7fffffffffffffff, %r9 + imulq $19, %r8 + andq %r9, %r13 # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 + addq %r8, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 # Sub - subq (%rdi), %r12 - movq $0x00, %rdx - sbbq 8(%rdi), %r13 - movq $-19, %rcx - sbbq 16(%rdi), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rdi), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + subq (%rsi), %r14 + sbbq 8(%rsi), %r15 + sbbq 16(%rsi), %rbx + sbbq 24(%rsi), %rbp + sbbq %r8, %r8 + shldq $0x01, %rbp, %r8 + imulq $-19, %r8 + andq %r9, %rbp # Add modulus (if underflow) - addq %rcx, %r12 
- adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rbp) - movq %r9, 8(%rbp) - movq %r10, 16(%rbp) - movq %r11, 24(%rbp) - movq %r12, (%rdi) - movq %r13, 8(%rdi) - movq %r14, 16(%rdi) - movq %r15, 24(%rdi) - addq $0x50, %rsp + subq %r8, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + movq %r14, (%rsi) + movq %r15, 8(%rsi) + movq %rbx, 16(%rsi) + movq %rbp, 24(%rsi) + addq $24, %rsp + popq %rbp + popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 - popq %rbp - popq %rbx repz retq #ifndef __APPLE__ -.size fe_ge_add_avx2,.-fe_ge_add_avx2 +.size ge_add_avx2,.-ge_add_avx2 #endif /* __APPLE__ */ #ifndef __APPLE__ .text -.globl fe_ge_sub_avx2 -.type fe_ge_sub_avx2,@function +.globl ge_sub_avx2 +.type ge_sub_avx2,@function .align 16 -fe_ge_sub_avx2: +ge_sub_avx2: #else .section __TEXT,__text -.globl _fe_ge_sub_avx2 +.globl _ge_sub_avx2 .p2align 4 -_fe_ge_sub_avx2: +_ge_sub_avx2: #endif /* __APPLE__ */ - pushq %rbx - pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - subq $0x50, %rsp + pushq %rbx + pushq %rbp + movq %rdx, %rax + subq $24, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rcx, 24(%rsp) - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq 8(%rsp), %rsi - movq 40(%rsp), %rbx - movq 32(%rsp), %rbp + movq %rax, 16(%rsp) + leaq 96(%rsi), %rcx + leaq 96(%rax), %rax + leaq 96(%rdi), %rdi + # Multiply + # A[0] * B[0] + movq (%rax), %rdx + mulxq (%rcx), %r10, %r11 + # A[2] * B[0] + mulxq 16(%rcx), %r12, %r13 + # A[1] * B[0] + mulxq 8(%rcx), %r8, %r9 + xorq %rbp, %rbp + adcxq %r8, %r11 + # A[3] * B[1] + movq 8(%rax), %rdx + mulxq 24(%rcx), %r14, %r15 + adcxq %r9, %r12 + # A[0] * B[1] + mulxq (%rcx), %r8, %r9 + adoxq %r8, %r11 + # A[2] * B[1] + mulxq 16(%rcx), %r8, %rbx + adoxq %r9, %r12 + adcxq %r8, %r13 + # A[1] * B[2] + movq 16(%rax), %rdx + mulxq 8(%rcx), %r8, %r9 + adcxq %rbx, %r14 + adoxq %r8, %r13 + adcxq %rbp, %r15 + adoxq %r9, %r14 + # A[0] * B[2] + mulxq (%rcx), %r8, %r9 + adoxq %rbp, %r15 + xorq %rbx, %rbx + adcxq %r8, %r12 + # A[1] * B[1] + movq 8(%rax), %rdx + mulxq 8(%rcx), %rdx, %r8 + adcxq %r9, %r13 + adoxq %rdx, %r12 + # A[1] * B[3] + movq 24(%rax), %rdx + adoxq %r8, %r13 + mulxq 8(%rcx), %r8, %r9 + adcxq %r8, %r14 + # A[2] * B[2] + movq 16(%rax), %rdx + mulxq 16(%rcx), %rdx, %r8 + adcxq %r9, %r15 + adoxq %rdx, %r14 + # A[3] * B[3] + movq 24(%rax), %rdx + adoxq %r8, %r15 + mulxq 24(%rcx), %r8, %r9 + adoxq %rbp, %rbx + adcxq %r8, %rbx + # A[0] * B[3] + mulxq (%rcx), %rdx, %r8 + adcxq %r9, %rbp + xorq %r9, %r9 + adcxq %rdx, %r13 + # A[3] * B[0] + movq 24(%rcx), %rdx + adcxq %r8, %r14 + mulxq (%rax), %rdx, %r8 + adoxq %rdx, %r13 + adoxq %r8, %r14 + # A[3] * B[2] + movq 24(%rcx), %rdx + mulxq 16(%rax), %rdx, %r8 + adcxq %rdx, %r15 + # A[2] * B[3] + movq 24(%rax), %rdx + adcxq %r8, %rbx + mulxq 16(%rcx), %r8, %rdx + adcxq %r9, %rbp + adoxq %r8, %r15 + adoxq %rdx, %rbx + adoxq %r9, %rbp + movq $38, %rdx + mulxq %rbp, %rbp, %r8 + addq %rbp, %r13 + adcq $0x00, %r8 + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %r8 + imulq $19, %r8, %r8 + andq %r9, %r13 + xorq %r9, %r9 + adoxq %r8, %r10 + mulxq %r14, %r8, %r14 + adcxq %r8, %r10 + adoxq %r14, %r11 + mulxq %r15, %r8, %r15 + adcxq %r8, %r11 + adoxq %r15, %r12 + mulxq %rbx, %r8, %rbx + adcxq %r8, %r12 + adoxq %rbx, %r13 + adcxq %r9, %r13 + # Store + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + movq %rsi, %rcx + leaq 32(%rsi), %rax + leaq -64(%rdi), %rsi + 
leaq -96(%rdi), %rdi + # Add-Sub # Add - movq (%rbx), %r8 - movq 8(%rbx), %r9 - movq 16(%rbx), %r10 - movq 24(%rbx), %rdx - movq %r8, %r12 - addq (%rbp), %r8 - movq %r9, %r13 - adcq 8(%rbp), %r9 + movq (%rax), %r10 + movq 8(%rax), %r11 + movq 16(%rax), %r12 + movq 24(%rax), %r13 movq %r10, %r14 - adcq 16(%rbp), %r10 - movq %rdx, %r15 - adcq 24(%rbp), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + addq (%rcx), %r10 + movq %r11, %r15 + adcq 8(%rcx), %r11 + movq %r12, %rbx + adcq 16(%rcx), %r12 + movq %r13, %rbp + adcq 24(%rcx), %r13 + movq $0x00, %r8 + adcq $0x00, %r8 + shldq $0x01, %r13, %r8 + movq $0x7fffffffffffffff, %r9 + imulq $19, %r8 + andq %r9, %r13 # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 + addq %r8, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 # Sub - subq (%rbp), %r12 - movq $0x00, %rdx - sbbq 8(%rbp), %r13 - movq $-19, %rcx - sbbq 16(%rbp), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rbp), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + subq (%rcx), %r14 + sbbq 8(%rcx), %r15 + sbbq 16(%rcx), %rbx + sbbq 24(%rcx), %rbp + sbbq %r8, %r8 + shldq $0x01, %rbp, %r8 + imulq $-19, %r8 + andq %r9, %rbp # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq %r12, (%rsi) - movq %r13, 8(%rsi) - movq %r14, 16(%rsi) - movq %r15, 24(%rsi) - movq 16(%rsp), %rbx - movq 176(%rsp), %rbp + subq %r8, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + movq %r14, (%rsi) + movq %r15, 8(%rsi) + movq %rbx, 16(%rsi) + movq %rbp, 24(%rsi) + movq 16(%rsp), %rax + leaq 32(%rax), %rax # Multiply # A[0] * B[0] - movq (%rbp), %rdx - mulxq (%rdi), %r8, %r9 + movq (%rax), %rdx + mulxq (%rdi), %r10, %r11 # A[2] * B[0] - mulxq 16(%rdi), %r10, %r11 + mulxq 16(%rdi), %r12, %r13 # A[1] * B[0] - mulxq 8(%rdi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbp), %rdx - mulxq 8(%rdi), %r12, %r13 - adcxq %rax, %r10 + mulxq 8(%rdi), %r8, %r9 + xorq %rbp, %rbp + adcxq %r8, %r11 + # A[3] * B[1] + movq 8(%rax), %rdx + mulxq 24(%rdi), %r14, %r15 + adcxq %r9, %r12 # A[0] * B[1] - movq 8(%rbp), %rdx - mulxq (%rdi), %rcx, %rax - adoxq %rcx, %r9 + mulxq (%rdi), %r8, %r9 + adoxq %r8, %r11 # A[2] * B[1] - mulxq 16(%rdi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 + mulxq 16(%rdi), %r8, %rbx + adoxq %r9, %r12 + adcxq %r8, %r13 # A[1] * B[2] - movq 16(%rbp), %rdx - mulxq 8(%rdi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 + movq 16(%rax), %rdx + mulxq 8(%rdi), %r8, %r9 + adcxq %rbx, %r14 + adoxq %r8, %r13 + adcxq %rbp, %r15 + adoxq %r9, %r14 # A[0] * B[2] - mulxq (%rdi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 + mulxq (%rdi), %r8, %r9 + adoxq %rbp, %r15 + xorq %rbx, %rbx + adcxq %r8, %r12 # A[1] * B[1] - movq 8(%rbp), %rdx - mulxq 8(%rdi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbp), %rdx - adoxq %rcx, %r11 - mulxq 24(%rdi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbp), %rdx - mulxq 16(%rdi), %rdx, %rcx - adcxq %rax, %r13 + movq 8(%rax), %rdx + mulxq 8(%rdi), %rdx, %r8 + adcxq %r9, %r13 adoxq %rdx, %r12 + # A[1] * B[3] + movq 24(%rax), 
%rdx + adoxq %r8, %r13 + mulxq 8(%rdi), %r8, %r9 + adcxq %r8, %r14 + # A[2] * B[2] + movq 16(%rax), %rdx + mulxq 16(%rdi), %rdx, %r8 + adcxq %r9, %r15 + adoxq %rdx, %r14 # A[3] * B[3] - movq 24(%rbp), %rdx - adoxq %rcx, %r13 - mulxq 24(%rdi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 + movq 24(%rax), %rdx + adoxq %r8, %r15 + mulxq 24(%rdi), %r8, %r9 + adoxq %rbp, %rbx + adcxq %r8, %rbx # A[0] * B[3] - mulxq (%rdi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbp), %rdx - adcxq %rcx, %r12 - mulxq 24(%rdi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbp), %rdx - mulxq 16(%rdi), %rdx, %rcx + mulxq (%rdi), %rdx, %r8 + adcxq %r9, %rbp + xorq %r9, %r9 adcxq %rdx, %r13 + # A[3] * B[0] + movq 24(%rdi), %rdx + adcxq %r8, %r14 + mulxq (%rax), %rdx, %r8 + adoxq %rdx, %r13 + adoxq %r8, %r14 # A[3] * B[2] - movq 16(%rbp), %rdx - adcxq %rcx, %r14 - mulxq 24(%rdi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 + movq 24(%rdi), %rdx + mulxq 16(%rax), %rdx, %r8 + adcxq %rdx, %r15 + # A[2] * B[3] + movq 24(%rax), %rdx + adcxq %r8, %rbx + mulxq 16(%rdi), %r8, %rdx + adcxq %r9, %rbp + adoxq %r8, %r15 + adoxq %rdx, %rbx + adoxq %r9, %rbp + movq $38, %rdx + mulxq %rbp, %rbp, %r8 + addq %rbp, %r13 + adcq $0x00, %r8 + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %r8 + imulq $19, %r8, %r8 + andq %r9, %r13 + xorq %r9, %r9 + adoxq %r8, %r10 + mulxq %r14, %r8, %r14 + adcxq %r8, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rax, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + mulxq %r15, %r8, %r15 + adcxq %r8, %r11 + adoxq %r15, %r12 + mulxq %rbx, %r8, %rbx + adcxq %r8, %r12 + adoxq %rbx, %r13 + adcxq %r9, %r13 # Store - movq %r8, (%rbx) - movq %r9, 8(%rbx) - movq %r10, 16(%rbx) - movq %r11, 24(%rbx) - movq 168(%rsp), %rbx + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + leaq -32(%rax), %rax # Multiply # A[0] * B[0] - movq (%rbx), %rdx - mulxq (%rsi), %r8, %r9 + movq (%rax), %rdx + mulxq (%rsi), %r10, %r11 # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 + mulxq 16(%rsi), %r12, %r13 # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbx), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 + mulxq 8(%rsi), %r8, %r9 + xorq %rbp, %rbp + adcxq %r8, %r11 + # A[3] * B[1] + movq 8(%rax), %rdx + mulxq 24(%rsi), %r14, %r15 + adcxq %r9, %r12 # A[0] * B[1] - movq 8(%rbx), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 + mulxq (%rsi), %r8, %r9 + adoxq %r8, %r11 # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 + mulxq 16(%rsi), %r8, %rbx + adoxq %r9, %r12 + adcxq %r8, %r13 # A[1] 
* B[2] - movq 16(%rbx), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 + movq 16(%rax), %rdx + mulxq 8(%rsi), %r8, %r9 + adcxq %rbx, %r14 + adoxq %r8, %r13 + adcxq %rbp, %r15 + adoxq %r9, %r14 # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 + mulxq (%rsi), %r8, %r9 + adoxq %rbp, %r15 + xorq %rbx, %rbx + adcxq %r8, %r12 # A[1] * B[1] - movq 8(%rbx), %rdx - mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbx), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 + movq 8(%rax), %rdx + mulxq 8(%rsi), %rdx, %r8 + adcxq %r9, %r13 adoxq %rdx, %r12 + # A[1] * B[3] + movq 24(%rax), %rdx + adoxq %r8, %r13 + mulxq 8(%rsi), %r8, %r9 + adcxq %r8, %r14 + # A[2] * B[2] + movq 16(%rax), %rdx + mulxq 16(%rsi), %rdx, %r8 + adcxq %r9, %r15 + adoxq %rdx, %r14 # A[3] * B[3] - movq 24(%rbx), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 + movq 24(%rax), %rdx + adoxq %r8, %r15 + mulxq 24(%rsi), %r8, %r9 + adoxq %rbp, %rbx + adcxq %r8, %rbx # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbx), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx + mulxq (%rsi), %rdx, %r8 + adcxq %r9, %rbp + xorq %r9, %r9 adcxq %rdx, %r13 + # A[3] * B[0] + movq 24(%rsi), %rdx + adcxq %r8, %r14 + mulxq (%rax), %rdx, %r8 + adoxq %rdx, %r13 + adoxq %r8, %r14 # A[3] * B[2] - movq 16(%rbx), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 + movq 24(%rsi), %rdx + mulxq 16(%rax), %rdx, %r8 + adcxq %rdx, %r15 + # A[2] * B[3] + movq 24(%rax), %rdx + adcxq %r8, %rbx + mulxq 16(%rsi), %r8, %rdx + adcxq %r9, %rbp + adoxq %r8, %r15 + adoxq %rdx, %rbx + adoxq %r9, %rbp + movq $38, %rdx + mulxq %rbp, %rbp, %r8 + addq %rbp, %r13 + adcq $0x00, %r8 + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %r8 + imulq $19, %r8, %r8 + andq %r9, %r13 + xorq %r9, %r9 + adoxq %r8, %r10 + mulxq %r14, %r8, %r14 + adcxq %r8, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rax, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + mulxq %r15, %r8, %r15 + adcxq %r8, %r11 + adoxq %r15, %r12 + mulxq %rbx, %r8, %rbx + adcxq %r8, %r12 + adoxq %rbx, %r13 + adcxq %r9, %r13 # Store - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - movq 24(%rsp), %rsi - movq 160(%rsp), %rbx - movq 144(%rsp), 
%rbp + movq %r10, (%rsi) + movq %r11, 8(%rsi) + movq %r12, 16(%rsi) + movq %r13, 24(%rsi) + leaq 64(%rcx), %rcx + leaq 64(%rax), %rax # Multiply # A[0] * B[0] - movq (%rbp), %rdx - mulxq (%rbx), %r8, %r9 + movq (%rax), %rdx + mulxq (%rcx), %r10, %r11 # A[2] * B[0] - mulxq 16(%rbx), %r10, %r11 + mulxq 16(%rcx), %r12, %r13 # A[1] * B[0] - mulxq 8(%rbx), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 - # A[1] * B[3] - movq 24(%rbp), %rdx - mulxq 8(%rbx), %r12, %r13 - adcxq %rax, %r10 + mulxq 8(%rcx), %r8, %r9 + xorq %rbp, %rbp + adcxq %r8, %r11 + # A[3] * B[1] + movq 8(%rax), %rdx + mulxq 24(%rcx), %r14, %r15 + adcxq %r9, %r12 # A[0] * B[1] - movq 8(%rbp), %rdx - mulxq (%rbx), %rcx, %rax - adoxq %rcx, %r9 + mulxq (%rcx), %r8, %r9 + adoxq %r8, %r11 # A[2] * B[1] - mulxq 16(%rbx), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 + mulxq 16(%rcx), %r8, %rbx + adoxq %r9, %r12 + adcxq %r8, %r13 # A[1] * B[2] - movq 16(%rbp), %rdx - mulxq 8(%rbx), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 + movq 16(%rax), %rdx + mulxq 8(%rcx), %r8, %r9 + adcxq %rbx, %r14 + adoxq %r8, %r13 + adcxq %rbp, %r15 + adoxq %r9, %r14 # A[0] * B[2] - mulxq (%rbx), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 + mulxq (%rcx), %r8, %r9 + adoxq %rbp, %r15 + xorq %rbx, %rbx + adcxq %r8, %r12 # A[1] * B[1] - movq 8(%rbp), %rdx - mulxq 8(%rbx), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbp), %rdx - adoxq %rcx, %r11 - mulxq 24(%rbx), %rcx, %rax - adcxq %rcx, %r12 - # A[2] * B[2] - movq 16(%rbp), %rdx - mulxq 16(%rbx), %rdx, %rcx - adcxq %rax, %r13 + movq 8(%rax), %rdx + mulxq 8(%rcx), %rdx, %r8 + adcxq %r9, %r13 adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbp), %rdx - adoxq %rcx, %r13 - mulxq 24(%rbx), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 - # A[0] * B[3] - mulxq (%rbx), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbp), %rdx - adcxq %rcx, %r12 - mulxq 24(%rbx), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbp), %rdx - mulxq 16(%rbx), %rdx, %rcx - adcxq %rdx, %r13 - # A[3] * B[2] - movq 16(%rbp), %rdx - adcxq %rcx, %r14 - mulxq 24(%rbx), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 - adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rax, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Store - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - movq 136(%rsp), %rsi - movq 152(%rsp), %rbx - # Multiply - # A[0] * B[0] - movq (%rbx), %rdx - mulxq (%rsi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rsi), %rcx, %rax - xorq %r15, %r15 - adcxq %rcx, %r9 # 
A[1] * B[3] - movq 24(%rbx), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rax, %r10 - # A[0] * B[1] - movq 8(%rbx), %rdx - mulxq (%rsi), %rcx, %rax - adoxq %rcx, %r9 - # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r14 - adoxq %rax, %r10 - adcxq %rcx, %r11 - # A[1] * B[2] - movq 16(%rbx), %rdx - mulxq 8(%rsi), %rcx, %rax - adcxq %r14, %r12 - adoxq %rcx, %r11 - adcxq %r15, %r13 - adoxq %rax, %r12 - # A[0] * B[2] - mulxq (%rsi), %rcx, %rax - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rcx, %r10 - # A[1] * B[1] - movq 8(%rbx), %rdx - mulxq 8(%rsi), %rdx, %rcx - adcxq %rax, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbx), %rdx - adoxq %rcx, %r11 - mulxq 24(%rsi), %rcx, %rax - adcxq %rcx, %r12 + movq 24(%rax), %rdx + adoxq %r8, %r13 + mulxq 8(%rcx), %r8, %r9 + adcxq %r8, %r14 # A[2] * B[2] - movq 16(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx - adcxq %rax, %r13 - adoxq %rdx, %r12 + movq 16(%rax), %rdx + mulxq 16(%rcx), %rdx, %r8 + adcxq %r9, %r15 + adoxq %rdx, %r14 # A[3] * B[3] - movq 24(%rbx), %rdx - adoxq %rcx, %r13 - mulxq 24(%rsi), %rcx, %rax - adoxq %r15, %r14 - adcxq %rcx, %r14 + movq 24(%rax), %rdx + adoxq %r8, %r15 + mulxq 24(%rcx), %r8, %r9 + adoxq %rbp, %rbx + adcxq %r8, %rbx # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %rax, %r15 - xorq %rax, %rax - adcxq %rdx, %r11 - # A[3] * B[0] - movq (%rbx), %rdx - adcxq %rcx, %r12 - mulxq 24(%rsi), %rdx, %rcx - adoxq %rdx, %r11 - adoxq %rcx, %r12 - # A[2] * B[3] - movq 24(%rbx), %rdx - mulxq 16(%rsi), %rdx, %rcx + mulxq (%rcx), %rdx, %r8 + adcxq %r9, %rbp + xorq %r9, %r9 adcxq %rdx, %r13 + # A[3] * B[0] + movq 24(%rcx), %rdx + adcxq %r8, %r14 + mulxq (%rax), %rdx, %r8 + adoxq %rdx, %r13 + adoxq %r8, %r14 # A[3] * B[2] - movq 16(%rbx), %rdx - adcxq %rcx, %r14 - mulxq 24(%rsi), %rcx, %rdx - adcxq %rax, %r15 - adoxq %rcx, %r13 - adoxq %rdx, %r14 - adoxq %rax, %r15 - # Reduce - movq $0x7fffffffffffffff, %rax - # Move top half into t4-t7 and remove top bit from t3 - shldq $0x01, %r14, %r15 - shldq $0x01, %r13, %r14 - shldq $0x01, %r12, %r13 - shldq $0x01, %r11, %r12 - andq %rax, %r11 - # Multiply top half by 19 - movq $19, %rdx - xorq %rax, %rax - mulxq %r12, %rcx, %r12 - adcxq %rcx, %r8 - adoxq %r12, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 - mulxq %r14, %rcx, %r14 - adcxq %rcx, %r10 + movq 24(%rcx), %rdx + mulxq 16(%rax), %rdx, %r8 + adcxq %rdx, %r15 + # A[2] * B[3] + movq 24(%rax), %rdx + adcxq %r8, %rbx + mulxq 16(%rcx), %r8, %rdx + adcxq %r9, %rbp + adoxq %r8, %r15 + adoxq %rdx, %rbx + adoxq %r9, %rbp + movq $38, %rdx + mulxq %rbp, %rbp, %r8 + addq %rbp, %r13 + adcq $0x00, %r8 + movq $0x7fffffffffffffff, %r9 + shldq $0x01, %r13, %r8 + imulq $19, %r8, %r8 + andq %r9, %r13 + xorq %r9, %r9 + adoxq %r8, %r10 + mulxq %r14, %r8, %r14 + adcxq %r8, %r10 adoxq %r14, %r11 - mulxq %r15, %r15, %rdx - adcxq %r15, %r11 - adoxq %rax, %rdx - adcxq %rax, %rdx - # Overflow - shldq $0x01, %r11, %rdx - movq $0x7fffffffffffffff, %rax - imulq $19, %rdx, %rcx - andq %rax, %r11 - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - # Reduce if top bit set - movq %r11, %rdx - sarq $63, %rdx - andq $19, %rdx - andq %rax, %r11 - addq %rdx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 + mulxq %r15, %r8, %r15 + adcxq %r8, %r11 + adoxq %r15, %r12 + mulxq %rbx, %r8, %rbx + adcxq %r8, %r12 + adoxq %rbx, %r13 + adcxq %r9, %r13 # Store - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - leaq 48(%rsp), %rsi + leaq 64(%rdi), %rdi # Double - movq (%rdi), %r8 - movq 8(%rdi), %r9 - addq %r8, 
%r8 - movq 16(%rdi), %r10 - adcq %r9, %r9 - movq 24(%rdi), %rdx - adcq %r10, %r10 - movq $-19, %rcx - adcq %rdx, %rdx - movq $0x7fffffffffffffff, %rax - movq %rdx, %r11 - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + addq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + movq $0x00, %r8 + adcq $0x00, %r8 + shldq $0x01, %r13, %r8 + movq $0x7fffffffffffffff, %r9 + imulq $19, %r8 + andq %r9, %r13 # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 - movq %r8, (%rsi) - movq %r9, 8(%rsi) - movq %r10, 16(%rsi) - movq %r11, 24(%rsi) - movq 8(%rsp), %rbx - movq 16(%rsp), %rbp + addq %r8, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + leaq -64(%rdi), %rdi + # Add-Sub # Add - movq (%rbp), %r8 - movq 8(%rbp), %r9 - movq 16(%rbp), %r10 - movq 24(%rbp), %rdx - movq %r8, %r12 - addq (%rbx), %r8 - movq %r9, %r13 - adcq 8(%rbx), %r9 + movq (%rdi), %r10 + movq 8(%rdi), %r11 + movq 16(%rdi), %r12 + movq 24(%rdi), %r13 movq %r10, %r14 - adcq 16(%rbx), %r10 - movq %rdx, %r15 - adcq 24(%rbx), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + addq (%rsi), %r10 + movq %r11, %r15 + adcq 8(%rsi), %r11 + movq %r12, %rbx + adcq 16(%rsi), %r12 + movq %r13, %rbp + adcq 24(%rsi), %r13 + movq $0x00, %r8 + adcq $0x00, %r8 + shldq $0x01, %r13, %r8 + movq $0x7fffffffffffffff, %r9 + imulq $19, %r8 + andq %r9, %r13 # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - sbbq %rdx, %r10 - sbbq %rax, %r11 + addq %r8, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 # Sub - subq (%rbx), %r12 - movq $0x00, %rdx - sbbq 8(%rbx), %r13 - movq $-19, %rcx - sbbq 16(%rbx), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rbx), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + subq (%rsi), %r14 + sbbq 8(%rsi), %r15 + sbbq 16(%rsi), %rbx + sbbq 24(%rsi), %rbp + sbbq %r8, %r8 + shldq $0x01, %rbp, %r8 + imulq $-19, %r8 + andq %r9, %rbp # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 - movq %r8, (%rbx) - movq %r9, 8(%rbx) - movq %r10, 16(%rbx) - movq %r11, 24(%rbx) - movq %r12, (%rdi) - movq %r13, 8(%rdi) - movq %r14, 16(%rdi) - movq %r15, 24(%rdi) - movq 24(%rsp), %rdi + subq %r8, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rsi) + movq %r11, 8(%rsi) + movq %r12, 16(%rsi) + movq %r13, 24(%rsi) + movq %r14, (%rdi) + movq %r15, 8(%rdi) + movq %rbx, 16(%rdi) + movq %rbp, 24(%rdi) + leaq 64(%rdi), %rsi + leaq 96(%rdi), %rdi + # Add-Sub # Add - movq (%rsi), %r8 - movq 8(%rsi), %r9 - movq 16(%rsi), %r10 - movq 24(%rsi), %rdx - movq %r8, %r12 - addq (%rdi), %r8 - movq %r9, %r13 - adcq 8(%rdi), %r9 + movq (%rsi), %r10 + movq 8(%rsi), %r11 + movq 16(%rsi), %r12 + movq 24(%rsi), %r13 movq %r10, %r14 - adcq 16(%rdi), %r10 - movq %rdx, %r15 - adcq 24(%rdi), %rdx - movq $-19, %rcx - movq %rdx, %r11 - movq $0x7fffffffffffffff, %rax - sarq $63, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + addq (%rdi), %r10 + movq %r11, %r15 + adcq 8(%rdi), %r11 + movq %r12, %rbx + adcq 16(%rdi), %r12 + movq %r13, %rbp + adcq 24(%rdi), %r13 + movq $0x00, %r8 + adcq $0x00, %r8 + shldq $0x01, %r13, %r8 + movq $0x7fffffffffffffff, %r9 + imulq $19, %r8 + andq %r9, %r13 # Sub modulus (if overflow) - subq %rcx, %r8 - sbbq %rdx, %r9 - 
sbbq %rdx, %r10 - sbbq %rax, %r11 + addq %r8, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 # Sub - subq (%rdi), %r12 - movq $0x00, %rdx - sbbq 8(%rdi), %r13 - movq $-19, %rcx - sbbq 16(%rdi), %r14 - movq $0x7fffffffffffffff, %rax - sbbq 24(%rdi), %r15 - sbbq $0x00, %rdx - # Mask the modulus - andq %rdx, %rcx - andq %rdx, %rax + subq (%rdi), %r14 + sbbq 8(%rdi), %r15 + sbbq 16(%rdi), %rbx + sbbq 24(%rdi), %rbp + sbbq %r8, %r8 + shldq $0x01, %rbp, %r8 + imulq $-19, %r8 + andq %r9, %rbp # Add modulus (if underflow) - addq %rcx, %r12 - adcq %rdx, %r13 - adcq %rdx, %r14 - adcq %rax, %r15 + subq %r8, %r14 + sbbq $0x00, %r15 + sbbq $0x00, %rbx + sbbq $0x00, %rbp + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) + movq %r14, (%rsi) + movq %r15, 8(%rsi) + movq %rbx, 16(%rsi) + movq %rbp, 24(%rsi) + addq $24, %rsp + popq %rbp + popq %rbx + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size ge_sub_avx2,.-ge_sub_avx2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl sc_reduce_avx2 +.type sc_reduce_avx2,@function +.align 16 +sc_reduce_avx2: +#else +.section __TEXT,__text +.globl _sc_reduce_avx2 +.p2align 4 +_sc_reduce_avx2: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + pushq %rbp + movq (%rdi), %r8 + movq 8(%rdi), %r9 + movq 16(%rdi), %r10 + movq 24(%rdi), %r11 + movq 32(%rdi), %r12 + movq 40(%rdi), %r13 + movq 48(%rdi), %r14 + movq 56(%rdi), %r15 + movq %r15, %rax + movq $0xfffffffffffffff, %rcx + shrq $56, %rax + shldq $4, %r14, %r15 + shldq $4, %r13, %r14 + shldq $4, %r12, %r13 + shldq $4, %r11, %r12 + andq %rcx, %r11 + andq %rcx, %r15 + # Add order times bits 504..511 + subq %rax, %r14 + sbbq $0x00, %r15 + movq $0xeb2106215d086329, %rdx + mulxq %rax, %rsi, %rcx + movq $0xa7ed9ce5a30a2c13, %rdx + addq %rsi, %r13 + mulxq %rax, %rsi, %rbx + adcq $0x00, %rcx + addq %rsi, %r12 + adcq %rbx, %r13 + adcq %rcx, %r14 + adcq $0x00, %r15 + # Sub product of top 4 words and order + movq $0xa7ed9ce5a30a2c13, %rdx + mulx %r12, %rcx, %rax + addq %rcx, %r8 + adcq %rax, %r9 + mulx %r14, %rcx, %rax + adcq %rcx, %r10 + adcq %rax, %r11 + mov $0x00, %rsi + adcq $0x00, %rsi + mulx %r13, %rcx, %rax + addq %rcx, %r9 + adcq %rax, %r10 + mulx %r15, %rcx, %rax + adcq %rcx, %r11 + adcq %rax, %rsi + movq $0xeb2106215d086329, %rdx + mulx %r12, %rcx, %rax + addq %rcx, %r9 + adcq %rax, %r10 + mulx %r14, %rcx, %rax + adcq %rcx, %r11 + adcq %rax, %rsi + mov $0x00, %rbx + adcq $0x00, %rbx + mulx %r13, %rcx, %rax + addq %rcx, %r10 + adcq %rax, %r11 + mulx %r15, %rcx, %rax + adcq %rcx, %rsi + adcq %rax, %rbx + subq %r12, %r10 + movq %rsi, %r12 + sbbq %r13, %r11 + movq %rbx, %r13 + sbbq %r14, %r12 + sbbq %r15, %r13 + movq %r13, %rax + sarq $57, %rax + # Conditionally subtract order starting at bit 125 + movq $0xa000000000000000, %rsi + movq $0xcb024c634b9eba7d, %rbx + movq $0x29bdf3bd45ef39a, %rbp + movq $0x200000000000000, %rcx + andq %rax, %rsi + andq %rax, %rbx + andq %rax, %rbp + andq %rax, %rcx + addq %rsi, %r9 + adcq %rbx, %r10 + adcq %rbp, %r11 + adcq $0x00, %r12 + adcq %rcx, %r13 + # Move bits 252-376 to own registers + movq $0xfffffffffffffff, %rax + shldq $4, %r12, %r13 + shldq $4, %r11, %r12 + andq %rax, %r11 + # Sub product of top 2 words and order + # * -5812631a5cf5d3ed + movq $0xa7ed9ce5a30a2c13, %rdx + mulx %r12, %rbp, %rax + movq $0x00, %rsi + addq %rbp, %r8 + adcq %rax, %r9 + mulx %r13, %rbp, %rax + adcq $0x00, %rsi + addq %rbp, %r9 + adcq %rax, %rsi + # * -14def9dea2f79cd7 + movq 
$0xeb2106215d086329, %rdx + mulx %r12, %rbp, %rax + movq $0x00, %rbx + addq %rbp, %r9 + adcq %rax, %r10 + mulx %r13, %rbp, %rax + adcq $0x00, %rbx + addq %rbp, %r10 + adcq %rax, %rbx + # Add overflows at 2 * 64 + movq $0xfffffffffffffff, %rcx + andq %rcx, %r11 + addq %rsi, %r10 + adcq %rbx, %r11 + # Subtract top at 2 * 64 + subq %r12, %r10 + sbbq %r13, %r11 + sbbq %rcx, %rcx + # Conditional sub order + movq $0x5812631a5cf5d3ed, %rsi + movq $0x14def9dea2f79cd6, %rbx + movq $0x1000000000000000, %rbp + andq %rcx, %rsi + andq %rcx, %rbx + andq %rcx, %rbp + addq %rsi, %r8 + movq $0xfffffffffffffff, %rsi + adcq %rbx, %r9 + adcq $0x00, %r10 + adcq %rbp, %r11 + andq %rsi, %r11 + # Store result movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) - movq %r12, (%rbp) - movq %r13, 8(%rbp) - movq %r14, 16(%rbp) - movq %r15, 24(%rbp) - addq $0x50, %rsp + popq %rbp + popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 + repz retq +#ifndef __APPLE__ +.size sc_reduce_avx2,.-sc_reduce_avx2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl sc_muladd_avx2 +.type sc_muladd_avx2,@function +.align 16 +sc_muladd_avx2: +#else +.section __TEXT,__text +.globl _sc_muladd_avx2 +.p2align 4 +_sc_muladd_avx2: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + pushq %rbp + movq %rdx, %r8 + movq %rcx, %r9 + # Multiply + # A[0] * B[0] + movq (%r8), %rdx + mulxq (%rsi), %r10, %r11 + # A[2] * B[0] + mulxq 16(%rsi), %r12, %r13 + # A[1] * B[0] + mulxq 8(%rsi), %rax, %rcx + xorq %rbp, %rbp + adcxq %rax, %r11 + # A[3] * B[1] + movq 8(%r8), %rdx + mulxq 24(%rsi), %r14, %r15 + adcxq %rcx, %r12 + # A[0] * B[1] + mulxq (%rsi), %rax, %rcx + adoxq %rax, %r11 + # A[2] * B[1] + mulxq 16(%rsi), %rax, %rbx + adoxq %rcx, %r12 + adcxq %rax, %r13 + # A[1] * B[2] + movq 16(%r8), %rdx + mulxq 8(%rsi), %rax, %rcx + adcxq %rbx, %r14 + adoxq %rax, %r13 + adcxq %rbp, %r15 + adoxq %rcx, %r14 + # A[0] * B[2] + mulxq (%rsi), %rax, %rcx + adoxq %rbp, %r15 + xorq %rbx, %rbx + adcxq %rax, %r12 + # A[1] * B[1] + movq 8(%r8), %rdx + mulxq 8(%rsi), %rdx, %rax + adcxq %rcx, %r13 + adoxq %rdx, %r12 + # A[1] * B[3] + movq 24(%r8), %rdx + adoxq %rax, %r13 + mulxq 8(%rsi), %rax, %rcx + adcxq %rax, %r14 + # A[2] * B[2] + movq 16(%r8), %rdx + mulxq 16(%rsi), %rdx, %rax + adcxq %rcx, %r15 + adoxq %rdx, %r14 + # A[3] * B[3] + movq 24(%r8), %rdx + adoxq %rax, %r15 + mulxq 24(%rsi), %rax, %rcx + adoxq %rbp, %rbx + adcxq %rax, %rbx + # A[0] * B[3] + mulxq (%rsi), %rdx, %rax + adcxq %rcx, %rbp + xorq %rcx, %rcx + adcxq %rdx, %r13 + # A[3] * B[0] + movq 24(%rsi), %rdx + adcxq %rax, %r14 + mulxq (%r8), %rdx, %rax + adoxq %rdx, %r13 + adoxq %rax, %r14 + # A[3] * B[2] + movq 24(%rsi), %rdx + mulxq 16(%r8), %rdx, %rax + adcxq %rdx, %r15 + # A[2] * B[3] + movq 24(%r8), %rdx + adcxq %rax, %rbx + mulxq 16(%rsi), %rax, %rdx + adcxq %rcx, %rbp + adoxq %rax, %r15 + adoxq %rdx, %rbx + adoxq %rcx, %rbp + # Add c to a * b + addq (%r9), %r10 + adcq 8(%r9), %r11 + adcq 16(%r9), %r12 + adcq 24(%r9), %r13 + adcq $0x00, %r14 + adcq $0x00, %r15 + adcq $0x00, %rbx + adcq $0x00, %rbp + movq %rbp, %rax + movq $0xfffffffffffffff, %rcx + shrq $56, %rax + shldq $4, %rbx, %rbp + shldq $4, %r15, %rbx + shldq $4, %r14, %r15 + shldq $4, %r13, %r14 + andq %rcx, %r13 + andq %rcx, %rbp + # Add order times bits 504..507 + subq %rax, %rbx + sbbq $0x00, %rbp + movq $0xeb2106215d086329, %rdx + mulxq %rax, %rsi, %rcx + movq $0xa7ed9ce5a30a2c13, %rdx + addq %rsi, %r15 + mulxq %rax, %rsi, %r8 + adcq $0x00, %rcx + addq %rsi, %r14 + adcq %r8, 
%r15 + adcq %rcx, %rbx + adcq $0x00, %rbp + # Sub product of top 4 words and order + movq $0xa7ed9ce5a30a2c13, %rdx + mulx %r14, %rcx, %rax + addq %rcx, %r10 + adcq %rax, %r11 + mulx %rbx, %rcx, %rax + adcq %rcx, %r12 + adcq %rax, %r13 + mov $0x00, %rsi + adcq $0x00, %rsi + mulx %r15, %rcx, %rax + addq %rcx, %r11 + adcq %rax, %r12 + mulx %rbp, %rcx, %rax + adcq %rcx, %r13 + adcq %rax, %rsi + movq $0xeb2106215d086329, %rdx + mulx %r14, %rcx, %rax + addq %rcx, %r11 + adcq %rax, %r12 + mulx %rbx, %rcx, %rax + adcq %rcx, %r13 + adcq %rax, %rsi + mov $0x00, %r8 + adcq $0x00, %r8 + mulx %r15, %rcx, %rax + addq %rcx, %r12 + adcq %rax, %r13 + mulx %rbp, %rcx, %rax + adcq %rcx, %rsi + adcq %rax, %r8 + subq %r14, %r12 + movq %rsi, %r14 + sbbq %r15, %r13 + movq %r8, %r15 + sbbq %rbx, %r14 + sbbq %rbp, %r15 + movq %r15, %rax + sarq $57, %rax + # Conditionally subtract order starting at bit 125 + movq $0xa000000000000000, %rsi + movq $0xcb024c634b9eba7d, %r8 + movq $0x29bdf3bd45ef39a, %r9 + movq $0x200000000000000, %rcx + andq %rax, %rsi + andq %rax, %r8 + andq %rax, %r9 + andq %rax, %rcx + addq %rsi, %r11 + adcq %r8, %r12 + adcq %r9, %r13 + adcq $0x00, %r14 + adcq %rcx, %r15 + # Move bits 252-376 to own registers + movq $0xfffffffffffffff, %rax + shldq $4, %r14, %r15 + shldq $4, %r13, %r14 + andq %rax, %r13 + # Sub product of top 2 words and order + # * -5812631a5cf5d3ed + movq $0xa7ed9ce5a30a2c13, %rdx + mulx %r14, %r9, %rax + movq $0x00, %rsi + addq %r9, %r10 + adcq %rax, %r11 + mulx %r15, %r9, %rax + adcq $0x00, %rsi + addq %r9, %r11 + adcq %rax, %rsi + # * -14def9dea2f79cd7 + movq $0xeb2106215d086329, %rdx + mulx %r14, %r9, %rax + movq $0x00, %r8 + addq %r9, %r11 + adcq %rax, %r12 + mulx %r15, %r9, %rax + adcq $0x00, %r8 + addq %r9, %r12 + adcq %rax, %r8 + # Add overflows at 2 * 64 + movq $0xfffffffffffffff, %rcx + andq %rcx, %r13 + addq %rsi, %r12 + adcq %r8, %r13 + # Subtract top at 2 * 64 + subq %r14, %r12 + sbbq %r15, %r13 + sbbq %rcx, %rcx + # Conditional sub order + movq $0x5812631a5cf5d3ed, %rsi + movq $0x14def9dea2f79cd6, %r8 + movq $0x1000000000000000, %r9 + andq %rcx, %rsi + andq %rcx, %r8 + andq %rcx, %r9 + addq %rsi, %r10 + movq $0xfffffffffffffff, %rsi + adcq %r8, %r11 + adcq $0x00, %r12 + adcq %r9, %r13 + andq %rsi, %r13 + # Store result + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) popq %rbp popq %rbx + popq %r15 + popq %r14 + popq %r13 + popq %r12 repz retq #ifndef __APPLE__ -.size fe_ge_sub_avx2,.-fe_ge_sub_avx2 +.size sc_muladd_avx2,.-sc_muladd_avx2 #endif /* __APPLE__ */ +#endif /* HAVE_ED25519 */ #endif /* HAVE_INTEL_AVX2 */ #if defined(__linux__) && defined(__ELF__) diff --git a/wolfcrypt/src/ge_low_mem.c b/wolfcrypt/src/ge_low_mem.c index c4fa510c89..abe6ea697a 100644 --- a/wolfcrypt/src/ge_low_mem.c +++ b/wolfcrypt/src/ge_low_mem.c @@ -441,28 +441,6 @@ void ge_scalarmult_base(ge_p3 *R,const unsigned char *nonce) } -/* pack the point h into array s */ -void ge_p3_tobytes(unsigned char *s,const ge_p3 *h) -{ - byte x[F25519_SIZE]; - byte y[F25519_SIZE]; - byte z1[F25519_SIZE]; - byte parity; - - fe_inv__distinct(z1, h->Z); - fe_mul__distinct(x, h->X, z1); - fe_mul__distinct(y, h->Y, z1); - - fe_normalize(x); - fe_normalize(y); - - parity = (x[0] & 1) << 7; - lm_copy(s, y); - fe_normalize(s); - s[31] |= parity; -} - - /* pack the point h into array s */ void ge_tobytes(unsigned char *s,const ge_p2 *h) { diff --git a/wolfcrypt/src/ge_operations.c b/wolfcrypt/src/ge_operations.c index 7f6c7d7264..39a709db2d 100644 --- 
a/wolfcrypt/src/ge_operations.c +++ b/wolfcrypt/src/ge_operations.c @@ -58,19 +58,23 @@ static void ge_p2_0(ge_p2 *h); #ifndef CURVED25519_ASM +#if defined(HAVE_ED25519_SIGN) || defined(HAVE_ED25519_MAKE_KEY) static void ge_precomp_0(ge_precomp *h); +#endif static void ge_p3_to_p2(ge_p2 *r,const ge_p3 *p); #endif static WC_INLINE void ge_p3_to_cached(ge_cached *r,const ge_p3 *p); + +#ifndef CURVED25519_ASM static void ge_p1p1_to_p2(ge_p2 *r,const ge_p1p1 *p); static WC_INLINE void ge_p1p1_to_p3(ge_p3 *r,const ge_p1p1 *p); static WC_INLINE void ge_p2_dbl(ge_p1p1 *r,const ge_p2 *p); static void ge_p3_dbl(ge_p1p1 *r,const ge_p3 *p); - static WC_INLINE void ge_madd(ge_p1p1 *r,const ge_p3 *p,const ge_precomp *q); static WC_INLINE void ge_msub(ge_p1p1 *r,const ge_p3 *p,const ge_precomp *q); static WC_INLINE void ge_add(ge_p1p1 *r,const ge_p3 *p,const ge_cached *q); static WC_INLINE void ge_sub(ge_p1p1 *r,const ge_p3 *p,const ge_cached *q); +#endif /* ge means group element. @@ -95,28 +99,6 @@ where d = -121665/121666. #define ORDER_4 0x1dea2f #define ORDER_5 0xa6f7c -#ifdef CURVED25519_ASM_32BIT -word64 load_3(const unsigned char *in) -{ - word64 result; - result = (word64) in[0]; - result |= ((word64) in[1]) << 8; - result |= ((word64) in[2]) << 16; - return result; -} - - -word64 load_4(const unsigned char *in) -{ - word64 result; - result = (word64) in[0]; - result |= ((word64) in[1]) << 8; - result |= ((word64) in[2]) << 16; - result |= ((word64) in[3]) << 24; - return result; -} -#endif - /* Input: s[0]+256*s[1]+...+256^63*s[63] = s @@ -126,6 +108,7 @@ word64 load_4(const unsigned char *in) where l = 2^252 + 27742317777372353535851937790883648493. Overwrites s in place. */ +#ifndef CURVED25519_ASM void sc_reduce(byte* s) { sword64 t[24]; @@ -638,7 +621,9 @@ void sc_muladd(byte* s, const byte* a, const byte* b, const byte* c) s[30] = (byte)(t[11] >> 9); s[31] = (byte)(t[11] >> 17); } +#endif #else +#ifndef CURVED25519_ASM static word64 load_6(const byte* a) { word64 n; @@ -929,26 +914,20 @@ void sc_muladd(byte* s, const byte* a, const byte* b, const byte* c) s[30] = (byte)(t[ 5] >> 30); s[31] = (byte)(t[ 5] >> 38); } +#endif /* !CURVED25519_ASM */ #endif /* !HAVE___UINT128_T || NO_CURVED25519_128BIT */ int ge_compress_key(byte* out, const byte* xIn, const byte* yIn, word32 keySz) { - ge x,y,z; - ge_p3 g; + ge_p2 g; byte bArray[ED25519_KEY_SIZE]; word32 i; - fe_0(x); - fe_0(y); - fe_1(z); - fe_frombytes(x, xIn); - fe_frombytes(y, yIn); + fe_frombytes(g.X, xIn); + fe_frombytes(g.Y, yIn); + fe_1(g.Z); - fe_copy(g.X, x); - fe_copy(g.Y, y); - fe_copy(g.Z, z); - - ge_p3_tobytes(bArray, &g); + ge_tobytes(bArray, &g); for (i = 0; i < keySz; i++) { out[keySz - 1 - i] = bArray[i]; @@ -961,9 +940,9 @@ int ge_compress_key(byte* out, const byte* xIn, const byte* yIn, word32 keySz) /* r = p + q */ +#ifndef CURVED25519_ASM static WC_INLINE void ge_add(ge_p1p1 *r,const ge_p3 *p,const ge_cached *q) { -#ifndef CURVED25519_ASM ge t0; fe_add(r->X,p->Y,p->X); fe_sub(r->Y,p->Y,p->X); @@ -976,11 +955,8 @@ static WC_INLINE void ge_add(ge_p1p1 *r,const ge_p3 *p,const ge_cached *q) fe_add(r->Y,r->Z,r->Y); fe_add(r->Z,t0,r->T); fe_sub(r->T,t0,r->T); -#else - fe_ge_add(r->X, r->Y, r->Z, r->T, p->X, p->Y, p->Z, p->T, q->Z, q->T2d, - q->YplusX, q->YminusX); -#endif } +#endif #ifndef CURVED25519_ASM @@ -994,11 +970,12 @@ static unsigned char equal(unsigned char b,unsigned char c) return (unsigned char)y; } - +#if defined(HAVE_ED25519_SIGN) || defined(HAVE_ED25519_MAKE_KEY) static unsigned char negative(signed char b) { 
return ((unsigned char)b) >> 7; } +#endif static WC_INLINE void cmov(ge_precomp *t,const ge_precomp *u,unsigned char b, @@ -1011,6 +988,7 @@ static WC_INLINE void cmov(ge_precomp *t,const ge_precomp *u,unsigned char b, } #endif +#if defined(HAVE_ED25519_SIGN) || defined(HAVE_ED25519_MAKE_KEY) #ifdef CURVED25519_ASM_64BIT static const ge_precomp base[64][8] = { { @@ -9188,6 +9166,7 @@ void ge_scalarmult_base(ge_p3 *h,const unsigned char *a) } #endif } +#endif /* HAVE_ED25519_SIGN || HAVE_ED25519_MAKE_KEY */ #define SLIDE_SIZE 256 @@ -9596,9 +9575,9 @@ int ge_frombytes_negate_vartime(ge_p3 *h,const unsigned char *s) r = p + q */ +#ifndef CURVED25519_ASM static WC_INLINE void ge_madd(ge_p1p1 *r,const ge_p3 *p,const ge_precomp *q) { -#ifndef CURVED25519_ASM ge t0; fe_add(r->X,p->Y,p->X); fe_sub(r->Y,p->Y,p->X); @@ -9610,11 +9589,8 @@ static WC_INLINE void ge_madd(ge_p1p1 *r,const ge_p3 *p,const ge_precomp *q) fe_add(r->Y,r->Z,r->Y); fe_add(r->Z,t0,r->T); fe_sub(r->T,t0,r->T); -#else - fe_ge_madd(r->X, r->Y, r->Z, r->T, p->X, p->Y, p->Z, p->T, q->xy2d, - q->yplusx, q->yminusx); -#endif } +#endif /* ge msub */ @@ -9623,9 +9599,9 @@ static WC_INLINE void ge_madd(ge_p1p1 *r,const ge_p3 *p,const ge_precomp *q) r = p - q */ +#ifndef CURVED25519_ASM static WC_INLINE void ge_msub(ge_p1p1 *r,const ge_p3 *p,const ge_precomp *q) { -#ifndef CURVED25519_ASM ge t0; fe_add(r->X,p->Y,p->X); fe_sub(r->Y,p->Y,p->X); @@ -9637,11 +9613,8 @@ static WC_INLINE void ge_msub(ge_p1p1 *r,const ge_p3 *p,const ge_precomp *q) fe_add(r->Y,r->Z,r->Y); fe_sub(r->Z,t0,r->T); fe_add(r->T,t0,r->T); -#else - fe_ge_msub(r->X, r->Y, r->Z, r->T, p->X, p->Y, p->Z, p->T, q->xy2d, - q->yplusx, q->yminusx); -#endif } +#endif /* ge p1p1 to p2 */ @@ -9649,16 +9622,14 @@ static WC_INLINE void ge_msub(ge_p1p1 *r,const ge_p3 *p,const ge_precomp *q) r = p */ +#ifndef CURVED25519_ASM static void ge_p1p1_to_p2(ge_p2 *r,const ge_p1p1 *p) { -#ifndef CURVED25519_ASM fe_mul(r->X,p->X,p->T); fe_mul(r->Y,p->Y,p->Z); fe_mul(r->Z,p->Z,p->T); -#else - fe_ge_to_p2(r->X, r->Y, r->Z, p->X, p->Y, p->Z, p->T); -#endif } +#endif /* ge p1p1 to p3 */ @@ -9667,17 +9638,15 @@ static void ge_p1p1_to_p2(ge_p2 *r,const ge_p1p1 *p) r = p */ +#ifndef CURVED25519_ASM static WC_INLINE void ge_p1p1_to_p3(ge_p3 *r,const ge_p1p1 *p) { -#ifndef CURVED25519_ASM fe_mul(r->X,p->X,p->T); fe_mul(r->Y,p->Y,p->Z); fe_mul(r->Z,p->Z,p->T); fe_mul(r->T,p->X,p->Y); -#else - fe_ge_to_p3(r->X, r->Y, r->Z, r->T, p->X, p->Y, p->Z, p->T); -#endif } +#endif /* ge p2 0 */ @@ -9696,9 +9665,9 @@ static void ge_p2_0(ge_p2 *h) r = 2 * p */ +#ifndef CURVED25519_ASM static WC_INLINE void ge_p2_dbl(ge_p1p1 *r,const ge_p2 *p) { -#ifndef CURVED25519_ASM ge t0; fe_sq(r->X,p->X); fe_sq(r->Z,p->Y); @@ -9709,10 +9678,8 @@ static WC_INLINE void ge_p2_dbl(ge_p1p1 *r,const ge_p2 *p) fe_sub(r->Z,r->Z,r->X); fe_sub(r->X,t0,r->Y); fe_sub(r->T,r->T,r->Z); -#else - fe_ge_dbl(r->X, r->Y, r->Z, r->T, p->X, p->Y, p->Z); -#endif } +#endif /* ge p3 dble */ @@ -9721,16 +9688,14 @@ static WC_INLINE void ge_p2_dbl(ge_p1p1 *r,const ge_p2 *p) r = 2 * p */ +#ifndef CURVED25519_ASM static void ge_p3_dbl(ge_p1p1 *r,const ge_p3 *p) { -#ifndef CURVED25519_ASM ge_p2 q; ge_p3_to_p2(&q,p); ge_p2_dbl(r,&q); -#else - fe_ge_dbl(r->X, r->Y, r->Z, r->T, p->X, p->Y, p->Z); -#endif } +#endif /* ge p3 to cached */ @@ -9784,6 +9749,7 @@ static void ge_p3_to_p2(ge_p2 *r,const ge_p3 *p) #endif +#ifdef GE_P3_TOBYTES_IMPL /* ge p3 tobytes */ void ge_p3_tobytes(unsigned char *s,const ge_p3 *h) { @@ -9797,9 +9763,11 @@ void 
ge_p3_tobytes(unsigned char *s,const ge_p3 *h) fe_tobytes(s,y); s[31] ^= (unsigned char)(fe_isnegative(x) << 7); } +#endif #ifndef CURVED25519_ASM +#if defined(HAVE_ED25519_SIGN) || defined(HAVE_ED25519_MAKE_KEY) /* ge_precomp_0 */ static void ge_precomp_0(ge_precomp *h) { @@ -9808,6 +9776,7 @@ static void ge_precomp_0(ge_precomp *h) fe_0(h->xy2d); } #endif +#endif /* ge_sub */ @@ -9815,9 +9784,9 @@ static void ge_precomp_0(ge_precomp *h) r = p - q */ +#ifndef CURVED25519_ASM static WC_INLINE void ge_sub(ge_p1p1 *r,const ge_p3 *p,const ge_cached *q) { -#ifndef CURVED25519_ASM ge t0; fe_add(r->X,p->Y,p->X); fe_sub(r->Y,p->Y,p->X); @@ -9830,12 +9799,8 @@ static WC_INLINE void ge_sub(ge_p1p1 *r,const ge_p3 *p,const ge_cached *q) fe_add(r->Y,r->Z,r->Y); fe_sub(r->Z,t0,r->T); fe_add(r->T,t0,r->T); -#else - fe_ge_sub(r->X, r->Y, r->Z, r->T, p->X, p->Y, p->Z, p->T, q->Z, q->T2d, - q->YplusX, q->YminusX); -#endif } - +#endif /* ge tobytes */ void ge_tobytes(unsigned char *s,const ge_p2 *h) diff --git a/wolfcrypt/src/port/arm/armv8-32-aes-asm.S b/wolfcrypt/src/port/arm/armv8-32-aes-asm.S index b6d71723ef..ccfbdeb7ed 100644 --- a/wolfcrypt/src/port/arm/armv8-32-aes-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-aes-asm.S @@ -30,273 +30,14 @@ #include #ifdef WOLFSSL_ARMASM -#ifndef __aarch64__ - .text - .type L_AES_ARM32_te, %object - .size L_AES_ARM32_te, 1024 - .align 4 -L_AES_ARM32_te: - .word 0xa5c66363 - .word 0x84f87c7c - .word 0x99ee7777 - .word 0x8df67b7b - .word 0xdfff2f2 - .word 0xbdd66b6b - .word 0xb1de6f6f - .word 0x5491c5c5 - .word 0x50603030 - .word 0x3020101 - .word 0xa9ce6767 - .word 0x7d562b2b - .word 0x19e7fefe - .word 0x62b5d7d7 - .word 0xe64dabab - .word 0x9aec7676 - .word 0x458fcaca - .word 0x9d1f8282 - .word 0x4089c9c9 - .word 0x87fa7d7d - .word 0x15effafa - .word 0xebb25959 - .word 0xc98e4747 - .word 0xbfbf0f0 - .word 0xec41adad - .word 0x67b3d4d4 - .word 0xfd5fa2a2 - .word 0xea45afaf - .word 0xbf239c9c - .word 0xf753a4a4 - .word 0x96e47272 - .word 0x5b9bc0c0 - .word 0xc275b7b7 - .word 0x1ce1fdfd - .word 0xae3d9393 - .word 0x6a4c2626 - .word 0x5a6c3636 - .word 0x417e3f3f - .word 0x2f5f7f7 - .word 0x4f83cccc - .word 0x5c683434 - .word 0xf451a5a5 - .word 0x34d1e5e5 - .word 0x8f9f1f1 - .word 0x93e27171 - .word 0x73abd8d8 - .word 0x53623131 - .word 0x3f2a1515 - .word 0xc080404 - .word 0x5295c7c7 - .word 0x65462323 - .word 0x5e9dc3c3 - .word 0x28301818 - .word 0xa1379696 - .word 0xf0a0505 - .word 0xb52f9a9a - .word 0x90e0707 - .word 0x36241212 - .word 0x9b1b8080 - .word 0x3ddfe2e2 - .word 0x26cdebeb - .word 0x694e2727 - .word 0xcd7fb2b2 - .word 0x9fea7575 - .word 0x1b120909 - .word 0x9e1d8383 - .word 0x74582c2c - .word 0x2e341a1a - .word 0x2d361b1b - .word 0xb2dc6e6e - .word 0xeeb45a5a - .word 0xfb5ba0a0 - .word 0xf6a45252 - .word 0x4d763b3b - .word 0x61b7d6d6 - .word 0xce7db3b3 - .word 0x7b522929 - .word 0x3edde3e3 - .word 0x715e2f2f - .word 0x97138484 - .word 0xf5a65353 - .word 0x68b9d1d1 - .word 0x0 - .word 0x2cc1eded - .word 0x60402020 - .word 0x1fe3fcfc - .word 0xc879b1b1 - .word 0xedb65b5b - .word 0xbed46a6a - .word 0x468dcbcb - .word 0xd967bebe - .word 0x4b723939 - .word 0xde944a4a - .word 0xd4984c4c - .word 0xe8b05858 - .word 0x4a85cfcf - .word 0x6bbbd0d0 - .word 0x2ac5efef - .word 0xe54faaaa - .word 0x16edfbfb - .word 0xc5864343 - .word 0xd79a4d4d - .word 0x55663333 - .word 0x94118585 - .word 0xcf8a4545 - .word 0x10e9f9f9 - .word 0x6040202 - .word 0x81fe7f7f - .word 0xf0a05050 - .word 0x44783c3c - .word 0xba259f9f - .word 0xe34ba8a8 - .word 0xf3a25151 - .word 0xfe5da3a3 - 
.word 0xc0804040 - .word 0x8a058f8f - .word 0xad3f9292 - .word 0xbc219d9d - .word 0x48703838 - .word 0x4f1f5f5 - .word 0xdf63bcbc - .word 0xc177b6b6 - .word 0x75afdada - .word 0x63422121 - .word 0x30201010 - .word 0x1ae5ffff - .word 0xefdf3f3 - .word 0x6dbfd2d2 - .word 0x4c81cdcd - .word 0x14180c0c - .word 0x35261313 - .word 0x2fc3ecec - .word 0xe1be5f5f - .word 0xa2359797 - .word 0xcc884444 - .word 0x392e1717 - .word 0x5793c4c4 - .word 0xf255a7a7 - .word 0x82fc7e7e - .word 0x477a3d3d - .word 0xacc86464 - .word 0xe7ba5d5d - .word 0x2b321919 - .word 0x95e67373 - .word 0xa0c06060 - .word 0x98198181 - .word 0xd19e4f4f - .word 0x7fa3dcdc - .word 0x66442222 - .word 0x7e542a2a - .word 0xab3b9090 - .word 0x830b8888 - .word 0xca8c4646 - .word 0x29c7eeee - .word 0xd36bb8b8 - .word 0x3c281414 - .word 0x79a7dede - .word 0xe2bc5e5e - .word 0x1d160b0b - .word 0x76addbdb - .word 0x3bdbe0e0 - .word 0x56643232 - .word 0x4e743a3a - .word 0x1e140a0a - .word 0xdb924949 - .word 0xa0c0606 - .word 0x6c482424 - .word 0xe4b85c5c - .word 0x5d9fc2c2 - .word 0x6ebdd3d3 - .word 0xef43acac - .word 0xa6c46262 - .word 0xa8399191 - .word 0xa4319595 - .word 0x37d3e4e4 - .word 0x8bf27979 - .word 0x32d5e7e7 - .word 0x438bc8c8 - .word 0x596e3737 - .word 0xb7da6d6d - .word 0x8c018d8d - .word 0x64b1d5d5 - .word 0xd29c4e4e - .word 0xe049a9a9 - .word 0xb4d86c6c - .word 0xfaac5656 - .word 0x7f3f4f4 - .word 0x25cfeaea - .word 0xafca6565 - .word 0x8ef47a7a - .word 0xe947aeae - .word 0x18100808 - .word 0xd56fbaba - .word 0x88f07878 - .word 0x6f4a2525 - .word 0x725c2e2e - .word 0x24381c1c - .word 0xf157a6a6 - .word 0xc773b4b4 - .word 0x5197c6c6 - .word 0x23cbe8e8 - .word 0x7ca1dddd - .word 0x9ce87474 - .word 0x213e1f1f - .word 0xdd964b4b - .word 0xdc61bdbd - .word 0x860d8b8b - .word 0x850f8a8a - .word 0x90e07070 - .word 0x427c3e3e - .word 0xc471b5b5 - .word 0xaacc6666 - .word 0xd8904848 - .word 0x5060303 - .word 0x1f7f6f6 - .word 0x121c0e0e - .word 0xa3c26161 - .word 0x5f6a3535 - .word 0xf9ae5757 - .word 0xd069b9b9 - .word 0x91178686 - .word 0x5899c1c1 - .word 0x273a1d1d - .word 0xb9279e9e - .word 0x38d9e1e1 - .word 0x13ebf8f8 - .word 0xb32b9898 - .word 0x33221111 - .word 0xbbd26969 - .word 0x70a9d9d9 - .word 0x89078e8e - .word 0xa7339494 - .word 0xb62d9b9b - .word 0x223c1e1e - .word 0x92158787 - .word 0x20c9e9e9 - .word 0x4987cece - .word 0xffaa5555 - .word 0x78502828 - .word 0x7aa5dfdf - .word 0x8f038c8c - .word 0xf859a1a1 - .word 0x80098989 - .word 0x171a0d0d - .word 0xda65bfbf - .word 0x31d7e6e6 - .word 0xc6844242 - .word 0xb8d06868 - .word 0xc3824141 - .word 0xb0299999 - .word 0x775a2d2d - .word 0x111e0f0f - .word 0xcb7bb0b0 - .word 0xfca85454 - .word 0xd66dbbbb - .word 0x3a2c1616 +#if !defined(__aarch64__) && defined(__arm__) +#ifndef WOLFSSL_ARMASM_INLINE +#ifndef NO_AES .text - .type L_AES_ARM32_td, %object - .size L_AES_ARM32_td, 1024 + .type L_AES_ARM32_td_data, %object + .size L_AES_ARM32_td_data, 1024 .align 4 -L_AES_ARM32_td: +L_AES_ARM32_td_data: .word 0x5051f4a7 .word 0x537e4165 .word 0xc31a17a4 @@ -553,12 +294,377 @@ L_AES_ARM32_td: .word 0x70d532b6 .word 0x74486c5c .word 0x42d0b857 -#ifndef NO_AES .text - .type L_AES_SEK_ARM32_rcon, %object - .size L_AES_SEK_ARM32_rcon, 40 + .type L_AES_ARM32_te_data, %object + .size L_AES_ARM32_te_data, 1024 + .align 4 +L_AES_ARM32_te_data: + .word 0xa5c66363 + .word 0x84f87c7c + .word 0x99ee7777 + .word 0x8df67b7b + .word 0xdfff2f2 + .word 0xbdd66b6b + .word 0xb1de6f6f + .word 0x5491c5c5 + .word 0x50603030 + .word 0x3020101 + .word 0xa9ce6767 + .word 0x7d562b2b + .word 0x19e7fefe + 
.word 0x62b5d7d7 + .word 0xe64dabab + .word 0x9aec7676 + .word 0x458fcaca + .word 0x9d1f8282 + .word 0x4089c9c9 + .word 0x87fa7d7d + .word 0x15effafa + .word 0xebb25959 + .word 0xc98e4747 + .word 0xbfbf0f0 + .word 0xec41adad + .word 0x67b3d4d4 + .word 0xfd5fa2a2 + .word 0xea45afaf + .word 0xbf239c9c + .word 0xf753a4a4 + .word 0x96e47272 + .word 0x5b9bc0c0 + .word 0xc275b7b7 + .word 0x1ce1fdfd + .word 0xae3d9393 + .word 0x6a4c2626 + .word 0x5a6c3636 + .word 0x417e3f3f + .word 0x2f5f7f7 + .word 0x4f83cccc + .word 0x5c683434 + .word 0xf451a5a5 + .word 0x34d1e5e5 + .word 0x8f9f1f1 + .word 0x93e27171 + .word 0x73abd8d8 + .word 0x53623131 + .word 0x3f2a1515 + .word 0xc080404 + .word 0x5295c7c7 + .word 0x65462323 + .word 0x5e9dc3c3 + .word 0x28301818 + .word 0xa1379696 + .word 0xf0a0505 + .word 0xb52f9a9a + .word 0x90e0707 + .word 0x36241212 + .word 0x9b1b8080 + .word 0x3ddfe2e2 + .word 0x26cdebeb + .word 0x694e2727 + .word 0xcd7fb2b2 + .word 0x9fea7575 + .word 0x1b120909 + .word 0x9e1d8383 + .word 0x74582c2c + .word 0x2e341a1a + .word 0x2d361b1b + .word 0xb2dc6e6e + .word 0xeeb45a5a + .word 0xfb5ba0a0 + .word 0xf6a45252 + .word 0x4d763b3b + .word 0x61b7d6d6 + .word 0xce7db3b3 + .word 0x7b522929 + .word 0x3edde3e3 + .word 0x715e2f2f + .word 0x97138484 + .word 0xf5a65353 + .word 0x68b9d1d1 + .word 0x0 + .word 0x2cc1eded + .word 0x60402020 + .word 0x1fe3fcfc + .word 0xc879b1b1 + .word 0xedb65b5b + .word 0xbed46a6a + .word 0x468dcbcb + .word 0xd967bebe + .word 0x4b723939 + .word 0xde944a4a + .word 0xd4984c4c + .word 0xe8b05858 + .word 0x4a85cfcf + .word 0x6bbbd0d0 + .word 0x2ac5efef + .word 0xe54faaaa + .word 0x16edfbfb + .word 0xc5864343 + .word 0xd79a4d4d + .word 0x55663333 + .word 0x94118585 + .word 0xcf8a4545 + .word 0x10e9f9f9 + .word 0x6040202 + .word 0x81fe7f7f + .word 0xf0a05050 + .word 0x44783c3c + .word 0xba259f9f + .word 0xe34ba8a8 + .word 0xf3a25151 + .word 0xfe5da3a3 + .word 0xc0804040 + .word 0x8a058f8f + .word 0xad3f9292 + .word 0xbc219d9d + .word 0x48703838 + .word 0x4f1f5f5 + .word 0xdf63bcbc + .word 0xc177b6b6 + .word 0x75afdada + .word 0x63422121 + .word 0x30201010 + .word 0x1ae5ffff + .word 0xefdf3f3 + .word 0x6dbfd2d2 + .word 0x4c81cdcd + .word 0x14180c0c + .word 0x35261313 + .word 0x2fc3ecec + .word 0xe1be5f5f + .word 0xa2359797 + .word 0xcc884444 + .word 0x392e1717 + .word 0x5793c4c4 + .word 0xf255a7a7 + .word 0x82fc7e7e + .word 0x477a3d3d + .word 0xacc86464 + .word 0xe7ba5d5d + .word 0x2b321919 + .word 0x95e67373 + .word 0xa0c06060 + .word 0x98198181 + .word 0xd19e4f4f + .word 0x7fa3dcdc + .word 0x66442222 + .word 0x7e542a2a + .word 0xab3b9090 + .word 0x830b8888 + .word 0xca8c4646 + .word 0x29c7eeee + .word 0xd36bb8b8 + .word 0x3c281414 + .word 0x79a7dede + .word 0xe2bc5e5e + .word 0x1d160b0b + .word 0x76addbdb + .word 0x3bdbe0e0 + .word 0x56643232 + .word 0x4e743a3a + .word 0x1e140a0a + .word 0xdb924949 + .word 0xa0c0606 + .word 0x6c482424 + .word 0xe4b85c5c + .word 0x5d9fc2c2 + .word 0x6ebdd3d3 + .word 0xef43acac + .word 0xa6c46262 + .word 0xa8399191 + .word 0xa4319595 + .word 0x37d3e4e4 + .word 0x8bf27979 + .word 0x32d5e7e7 + .word 0x438bc8c8 + .word 0x596e3737 + .word 0xb7da6d6d + .word 0x8c018d8d + .word 0x64b1d5d5 + .word 0xd29c4e4e + .word 0xe049a9a9 + .word 0xb4d86c6c + .word 0xfaac5656 + .word 0x7f3f4f4 + .word 0x25cfeaea + .word 0xafca6565 + .word 0x8ef47a7a + .word 0xe947aeae + .word 0x18100808 + .word 0xd56fbaba + .word 0x88f07878 + .word 0x6f4a2525 + .word 0x725c2e2e + .word 0x24381c1c + .word 0xf157a6a6 + .word 0xc773b4b4 + .word 0x5197c6c6 + .word 0x23cbe8e8 + 
.word 0x7ca1dddd + .word 0x9ce87474 + .word 0x213e1f1f + .word 0xdd964b4b + .word 0xdc61bdbd + .word 0x860d8b8b + .word 0x850f8a8a + .word 0x90e07070 + .word 0x427c3e3e + .word 0xc471b5b5 + .word 0xaacc6666 + .word 0xd8904848 + .word 0x5060303 + .word 0x1f7f6f6 + .word 0x121c0e0e + .word 0xa3c26161 + .word 0x5f6a3535 + .word 0xf9ae5757 + .word 0xd069b9b9 + .word 0x91178686 + .word 0x5899c1c1 + .word 0x273a1d1d + .word 0xb9279e9e + .word 0x38d9e1e1 + .word 0x13ebf8f8 + .word 0xb32b9898 + .word 0x33221111 + .word 0xbbd26969 + .word 0x70a9d9d9 + .word 0x89078e8e + .word 0xa7339494 + .word 0xb62d9b9b + .word 0x223c1e1e + .word 0x92158787 + .word 0x20c9e9e9 + .word 0x4987cece + .word 0xffaa5555 + .word 0x78502828 + .word 0x7aa5dfdf + .word 0x8f038c8c + .word 0xf859a1a1 + .word 0x80098989 + .word 0x171a0d0d + .word 0xda65bfbf + .word 0x31d7e6e6 + .word 0xc6844242 + .word 0xb8d06868 + .word 0xc3824141 + .word 0xb0299999 + .word 0x775a2d2d + .word 0x111e0f0f + .word 0xcb7bb0b0 + .word 0xfca85454 + .word 0xd66dbbbb + .word 0x3a2c1616 + .text + .type L_AES_ARM32_td, %object + .size L_AES_ARM32_td, 12 + .align 4 +L_AES_ARM32_td: + .word L_AES_ARM32_td_data + .text + .type L_AES_ARM32_te, %object + .size L_AES_ARM32_te, 12 + .align 4 +L_AES_ARM32_te: + .word L_AES_ARM32_te_data +#ifdef HAVE_AES_DECRYPT + .text + .align 4 + .globl AES_invert_key + .type AES_invert_key, %function +AES_invert_key: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + ldr r12, L_AES_ARM32_te + ldr lr, L_AES_ARM32_td + add r10, r0, r1, lsl #4 + mov r11, r1 +L_AES_invert_key_loop: + ldm r0, {r2, r3, r4, r5} + ldm r10, {r6, r7, r8, r9} + stm r10, {r2, r3, r4, r5} + stm r0!, {r6, r7, r8, r9} + subs r11, r11, #2 + sub r10, r10, #16 + bne L_AES_invert_key_loop + sub r0, r0, r1, lsl #3 + add r0, r0, #16 + sub r11, r1, #1 +L_AES_invert_key_mix_loop: + ldm r0, {r2, r3, r4, r5} + ubfx r6, r2, #0, #8 + ubfx r7, r2, #8, #8 + ubfx r8, r2, #16, #8 + lsr r9, r2, #24 + ldrb r6, [r12, r6, lsl #2] + ldrb r7, [r12, r7, lsl #2] + ldrb r8, [r12, r8, lsl #2] + ldrb r9, [r12, r9, lsl #2] + ldr r6, [lr, r6, lsl #2] + ldr r7, [lr, r7, lsl #2] + ldr r8, [lr, r8, lsl #2] + ldr r9, [lr, r9, lsl #2] + eor r8, r8, r6, ror #16 + eor r8, r8, r7, ror #8 + eor r8, r8, r9, ror #24 + str r8, [r0], #4 + ubfx r6, r3, #0, #8 + ubfx r7, r3, #8, #8 + ubfx r8, r3, #16, #8 + lsr r9, r3, #24 + ldrb r6, [r12, r6, lsl #2] + ldrb r7, [r12, r7, lsl #2] + ldrb r8, [r12, r8, lsl #2] + ldrb r9, [r12, r9, lsl #2] + ldr r6, [lr, r6, lsl #2] + ldr r7, [lr, r7, lsl #2] + ldr r8, [lr, r8, lsl #2] + ldr r9, [lr, r9, lsl #2] + eor r8, r8, r6, ror #16 + eor r8, r8, r7, ror #8 + eor r8, r8, r9, ror #24 + str r8, [r0], #4 + ubfx r6, r4, #0, #8 + ubfx r7, r4, #8, #8 + ubfx r8, r4, #16, #8 + lsr r9, r4, #24 + ldrb r6, [r12, r6, lsl #2] + ldrb r7, [r12, r7, lsl #2] + ldrb r8, [r12, r8, lsl #2] + ldrb r9, [r12, r9, lsl #2] + ldr r6, [lr, r6, lsl #2] + ldr r7, [lr, r7, lsl #2] + ldr r8, [lr, r8, lsl #2] + ldr r9, [lr, r9, lsl #2] + eor r8, r8, r6, ror #16 + eor r8, r8, r7, ror #8 + eor r8, r8, r9, ror #24 + str r8, [r0], #4 + ubfx r6, r5, #0, #8 + ubfx r7, r5, #8, #8 + ubfx r8, r5, #16, #8 + lsr r9, r5, #24 + ldrb r6, [r12, r6, lsl #2] + ldrb r7, [r12, r7, lsl #2] + ldrb r8, [r12, r8, lsl #2] + ldrb r9, [r12, r9, lsl #2] + ldr r6, [lr, r6, lsl #2] + ldr r7, [lr, r7, lsl #2] + ldr r8, [lr, r8, lsl #2] + ldr r9, [lr, r9, lsl #2] + eor r8, r8, r6, ror #16 + eor r8, r8, r7, ror #8 + eor r8, r8, r9, ror #24 + str r8, [r0], #4 + subs r11, r11, #1 + bne L_AES_invert_key_mix_loop + pop {r4, r5, r6, 
r7, r8, r9, r10, r11, pc} + .size AES_invert_key,.-AES_invert_key +#endif /* HAVE_AES_DECRYPT */ + .text + .type L_AES_ARM32_rcon, %object + .size L_AES_ARM32_rcon, 40 .align 4 -L_AES_SEK_ARM32_rcon: +L_AES_ARM32_rcon: .word 0x1000000 .word 0x2000000 .word 0x4000000 @@ -575,10 +681,8 @@ L_AES_SEK_ARM32_rcon: .type AES_set_encrypt_key, %function AES_set_encrypt_key: push {r4, r5, r6, r7, r8, lr} - adr r4, AES_set_encrypt_key - mov r8, #AES_set_encrypt_key-L_AES_ARM32_te - sub r8, r4, r8 - adr lr, L_AES_SEK_ARM32_rcon + ldr r8, L_AES_ARM32_te + adr lr, L_AES_ARM32_rcon cmp r1, #0x80 beq L_AES_set_encrypt_key_start_128 cmp r1, #0xc0 @@ -620,17 +724,15 @@ AES_set_encrypt_key: sub r2, r2, #16 mov r12, #6 L_AES_set_encrypt_key_loop_256: - mov r3, r7 - mov r7, #0xff - and r5, r7, r3, lsr #8 - and r4, r7, r3 - and r6, r7, r3, lsr #16 - lsr r3, r3, #24 + ubfx r4, r7, #0, #8 + ubfx r5, r7, #8, #8 + ubfx r6, r7, #16, #8 + lsr r7, r7, #24 ldrb r4, [r8, r4, lsl #2] - ldrb r6, [r8, r6, lsl #2] ldrb r5, [r8, r5, lsl #2] - ldrb r3, [r8, r3, lsl #2] - eor r3, r3, r4, lsl #8 + ldrb r6, [r8, r6, lsl #2] + ldrb r7, [r8, r7, lsl #2] + eor r3, r7, r4, lsl #8 eor r3, r3, r5, lsl #16 eor r3, r3, r6, lsl #24 ldm r2!, {r4, r5, r6, r7} @@ -644,11 +746,10 @@ L_AES_set_encrypt_key_loop_256: stm r2, {r4, r5, r6, r7} sub r2, r2, #16 mov r3, r7 - mov r7, #0xff - and r4, r7, r3, lsr #8 - and r5, r7, r3, lsr #16 + ubfx r4, r3, #8, #8 + ubfx r5, r3, #16, #8 lsr r6, r3, #24 - and r3, r7, r3 + ubfx r3, r3, #0, #8 ldrb r4, [r8, r4, lsl #2] ldrb r6, [r8, r6, lsl #2] ldrb r5, [r8, r5, lsl #2] @@ -666,17 +767,15 @@ L_AES_set_encrypt_key_loop_256: sub r2, r2, #16 subs r12, r12, #1 bne L_AES_set_encrypt_key_loop_256 - mov r3, r7 - mov r7, #0xff - and r5, r7, r3, lsr #8 - and r4, r7, r3 - and r6, r7, r3, lsr #16 - lsr r3, r3, #24 + ubfx r4, r7, #0, #8 + ubfx r5, r7, #8, #8 + ubfx r6, r7, #16, #8 + lsr r7, r7, #24 ldrb r4, [r8, r4, lsl #2] - ldrb r6, [r8, r6, lsl #2] ldrb r5, [r8, r5, lsl #2] - ldrb r3, [r8, r3, lsl #2] - eor r3, r3, r4, lsl #8 + ldrb r6, [r8, r6, lsl #2] + ldrb r7, [r8, r7, lsl #2] + eor r3, r7, r4, lsl #8 eor r3, r3, r5, lsl #16 eor r3, r3, r6, lsl #24 ldm r2!, {r4, r5, r6, r7} @@ -725,17 +824,15 @@ L_AES_set_encrypt_key_start_192: mov r7, r1 mov r12, #7 L_AES_set_encrypt_key_loop_192: - mov r3, r7 - mov r5, #0xff - and r1, r5, r3, lsr #8 - and r0, r5, r3 - and r4, r5, r3, lsr #16 - lsr r3, r3, #24 + ubfx r0, r7, #0, #8 + ubfx r1, r7, #8, #8 + ubfx r4, r7, #16, #8 + lsr r7, r7, #24 ldrb r0, [r8, r0, lsl #2] - ldrb r4, [r8, r4, lsl #2] ldrb r1, [r8, r1, lsl #2] - ldrb r3, [r8, r3, lsl #2] - eor r3, r3, r0, lsl #8 + ldrb r4, [r8, r4, lsl #2] + ldrb r7, [r8, r7, lsl #2] + eor r3, r7, r0, lsl #8 eor r3, r3, r1, lsl #16 eor r3, r3, r4, lsl #24 ldm r2!, {r0, r1, r4, r5, r6, r7} @@ -750,17 +847,15 @@ L_AES_set_encrypt_key_loop_192: stm r2, {r0, r1, r4, r5, r6, r7} subs r12, r12, #1 bne L_AES_set_encrypt_key_loop_192 - mov r3, r7 - mov r5, #0xff - and r1, r5, r3, lsr #8 - and r0, r5, r3 - and r4, r5, r3, lsr #16 - lsr r3, r3, #24 + ubfx r0, r7, #0, #8 + ubfx r1, r7, #8, #8 + ubfx r4, r7, #16, #8 + lsr r7, r7, #24 ldrb r0, [r8, r0, lsl #2] - ldrb r4, [r8, r4, lsl #2] ldrb r1, [r8, r1, lsl #2] - ldrb r3, [r8, r3, lsl #2] - eor r3, r3, r0, lsl #8 + ldrb r4, [r8, r4, lsl #2] + ldrb r7, [r8, r7, lsl #2] + eor r3, r7, r0, lsl #8 eor r3, r3, r1, lsl #16 eor r3, r3, r4, lsl #24 ldm r2!, {r0, r1, r4, r5, r6, r7} @@ -792,17 +887,15 @@ L_AES_set_encrypt_key_start_128: stm r2, {r4, r5, r6, r7} mov r12, #10 
L_AES_set_encrypt_key_loop_128: - mov r3, r7 - mov r7, #0xff - and r5, r7, r3, lsr #8 - and r4, r7, r3 - and r6, r7, r3, lsr #16 - lsr r3, r3, #24 + ubfx r4, r7, #0, #8 + ubfx r5, r7, #8, #8 + ubfx r6, r7, #16, #8 + lsr r7, r7, #24 ldrb r4, [r8, r4, lsl #2] - ldrb r6, [r8, r6, lsl #2] ldrb r5, [r8, r5, lsl #2] - ldrb r3, [r8, r3, lsl #2] - eor r3, r3, r4, lsl #8 + ldrb r6, [r8, r6, lsl #2] + ldrb r7, [r8, r7, lsl #2] + eor r3, r7, r4, lsl #8 eor r3, r3, r5, lsl #16 eor r3, r3, r6, lsl #24 ldm r2!, {r4, r5, r6, r7} @@ -818,331 +911,224 @@ L_AES_set_encrypt_key_loop_128: L_AES_set_encrypt_key_end: pop {r4, r5, r6, r7, r8, pc} .size AES_set_encrypt_key,.-AES_set_encrypt_key -#ifdef HAVE_AES_DECRYPT - .text - .align 4 - .globl AES_invert_key - .type AES_invert_key, %function -AES_invert_key: - push {r4, r5, r6, r7, r8, r9, r10, r11, lr} - adr r4, AES_invert_key - mov r9, #AES_invert_key-L_AES_ARM32_te - mov r10, #AES_invert_key-L_AES_ARM32_td - sub r9, r4, r9 - sub r10, r4, r10 - add r8, r0, r1, lsl #4 - mov r11, r1 -L_AES_invert_key_loop: - ldm r0, {r2, r3, r12, lr} - ldm r8, {r4, r5, r6, r7} - stm r8, {r2, r3, r12, lr} - stm r0!, {r4, r5, r6, r7} - subs r11, r11, #2 - sub r8, r8, #16 - bne L_AES_invert_key_loop - sub r0, r0, r1, lsl #3 - add r0, r0, #16 - sub r11, r1, #1 - mov r1, #0xff -L_AES_invert_key_mix_loop: - ldm r0, {r2, r3, r12, lr} - lsr r4, r2, #24 - and r7, r1, r2 - and r8, r1, r2, lsr #16 - and r6, r1, r2, lsr #8 - ldrb r4, [r9, r4, lsl #2] - ldrb r7, [r9, r7, lsl #2] - ldrb r8, [r9, r8, lsl #2] - ldrb r6, [r9, r6, lsl #2] - ldr r4, [r10, r4, lsl #2] - ldr r7, [r10, r7, lsl #2] - ldr r8, [r10, r8, lsl #2] - ldr r6, [r10, r6, lsl #2] - eor r8, r8, r4, ror #24 - eor r8, r8, r7, ror #16 - eor r8, r8, r6, ror #8 - str r8, [r0], #4 - lsr r4, r3, #24 - and r7, r1, r3 - and r8, r1, r3, lsr #16 - and r6, r1, r3, lsr #8 - ldrb r4, [r9, r4, lsl #2] - ldrb r7, [r9, r7, lsl #2] - ldrb r8, [r9, r8, lsl #2] - ldrb r6, [r9, r6, lsl #2] - ldr r4, [r10, r4, lsl #2] - ldr r7, [r10, r7, lsl #2] - ldr r8, [r10, r8, lsl #2] - ldr r6, [r10, r6, lsl #2] - eor r8, r8, r4, ror #24 - eor r8, r8, r7, ror #16 - eor r8, r8, r6, ror #8 - str r8, [r0], #4 - lsr r4, r12, #24 - and r7, r1, r12 - and r8, r1, r12, lsr #16 - and r6, r1, r12, lsr #8 - ldrb r4, [r9, r4, lsl #2] - ldrb r7, [r9, r7, lsl #2] - ldrb r8, [r9, r8, lsl #2] - ldrb r6, [r9, r6, lsl #2] - ldr r4, [r10, r4, lsl #2] - ldr r7, [r10, r7, lsl #2] - ldr r8, [r10, r8, lsl #2] - ldr r6, [r10, r6, lsl #2] - eor r8, r8, r4, ror #24 - eor r8, r8, r7, ror #16 - eor r8, r8, r6, ror #8 - str r8, [r0], #4 - lsr r4, lr, #24 - and r7, r1, lr - and r8, r1, lr, lsr #16 - and r6, r1, lr, lsr #8 - ldrb r4, [r9, r4, lsl #2] - ldrb r7, [r9, r7, lsl #2] - ldrb r8, [r9, r8, lsl #2] - ldrb r6, [r9, r6, lsl #2] - ldr r4, [r10, r4, lsl #2] - ldr r7, [r10, r7, lsl #2] - ldr r8, [r10, r8, lsl #2] - ldr r6, [r10, r6, lsl #2] - eor r8, r8, r4, ror #24 - eor r8, r8, r7, ror #16 - eor r8, r8, r6, ror #8 - str r8, [r0], #4 - subs r11, r11, #1 - bne L_AES_invert_key_mix_loop - pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} - .size AES_invert_key,.-AES_invert_key -#endif /* HAVE_AES_DECRYPT */ #if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) .text .align 4 .globl AES_encrypt_block .type AES_encrypt_block, %function AES_encrypt_block: -L_AES_encrypt_block_14: - push {r2, lr} - ldr lr, [sp, #12] - mov r2, #6 - b L_AES_encrypt_block_nr -L_AES_encrypt_block_12: - push {r2, lr} - ldr lr, [sp, #12] - mov r2, #5 - b 
L_AES_encrypt_block_nr -L_AES_encrypt_block_10: - push {r2, lr} - ldr lr, [sp, #12] - mov r2, #4 + push {lr} L_AES_encrypt_block_nr: + ubfx r8, r5, #16, #8 lsr r11, r4, #24 - and r1, r12, r7 - and r8, r12, r5, lsr #16 - and r0, r12, r6, lsr #8 - ldr r11, [lr, r11, lsl #2] - ldr r1, [lr, r1, lsl #2] - ldr r8, [lr, r8, lsl #2] - ldr r0, [lr, r0, lsl #2] - and r9, r12, r6, lsr #16 + ubfx lr, r6, #8, #8 + ubfx r2, r7, #0, #8 + ldr r8, [r0, r8, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] + ubfx r9, r6, #16, #8 eor r8, r8, r11, ror #24 lsr r11, r5, #24 - eor r8, r8, r0, ror #8 - and r0, r12, r7, lsr #8 - eor r8, r8, r1, ror #16 - and r1, r12, r4 - ldr r9, [lr, r9, lsl #2] - ldr r11, [lr, r11, lsl #2] - ldr r0, [lr, r0, lsl #2] - ldr r1, [lr, r1, lsl #2] - and r10, r12, r7, lsr #16 + eor r8, r8, lr, ror #8 + ubfx lr, r7, #8, #8 + eor r8, r8, r2, ror #16 + ubfx r2, r4, #0, #8 + ldr r9, [r0, r9, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] + ubfx r10, r7, #16, #8 eor r9, r9, r11, ror #24 lsr r11, r6, #24 - eor r9, r9, r0, ror #8 - and r0, r12, r4, lsr #8 - eor r9, r9, r1, ror #16 - and r1, r12, r5 - ldr r10, [lr, r10, lsl #2] - ldr r11, [lr, r11, lsl #2] - ldr r0, [lr, r0, lsl #2] - ldr r1, [lr, r1, lsl #2] - and r6, r12, r6 + eor r9, r9, lr, ror #8 + ubfx lr, r4, #8, #8 + eor r9, r9, r2, ror #16 + ubfx r2, r5, #0, #8 + ldr r10, [r0, r10, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] + ubfx r6, r6, #0, #8 eor r10, r10, r11, ror #24 - and r11, r12, r4, lsr #16 - eor r10, r10, r0, ror #8 - lsr r0, r7, #24 - eor r10, r10, r1, ror #16 - and r1, r12, r5, lsr #8 - ldr r6, [lr, r6, lsl #2] - ldr r0, [lr, r0, lsl #2] - ldr r11, [lr, r11, lsl #2] - ldr r1, [lr, r1, lsl #2] - eor r0, r0, r6, ror #24 + ubfx r11, r4, #16, #8 + eor r10, r10, lr, ror #8 + lsr lr, r7, #24 + eor r10, r10, r2, ror #16 + ubfx r2, r5, #8, #8 + ldr r6, [r0, r6, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr r2, [r0, r2, lsl #2] + eor lr, lr, r6, ror #24 ldm r3!, {r4, r5, r6, r7} - eor r11, r11, r0, ror #24 - eor r11, r11, r1, ror #8 + eor r11, r11, lr, ror #24 + eor r11, r11, r2, ror #8 # XOR in Key Schedule eor r8, r8, r4 eor r9, r9, r5 eor r10, r10, r6 eor r11, r11, r7 + ubfx r4, r9, #16, #8 lsr r7, r8, #24 - and r1, r12, r11 - and r4, r12, r9, lsr #16 - and r0, r12, r10, lsr #8 - ldr r7, [lr, r7, lsl #2] - ldr r1, [lr, r1, lsl #2] - ldr r4, [lr, r4, lsl #2] - ldr r0, [lr, r0, lsl #2] - and r5, r12, r10, lsr #16 + ubfx lr, r10, #8, #8 + ubfx r2, r11, #0, #8 + ldr r4, [r0, r4, lsl #2] + ldr r7, [r0, r7, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] + ubfx r5, r10, #16, #8 eor r4, r4, r7, ror #24 lsr r7, r9, #24 - eor r4, r4, r0, ror #8 - and r0, r12, r11, lsr #8 - eor r4, r4, r1, ror #16 - and r1, r12, r8 - ldr r5, [lr, r5, lsl #2] - ldr r7, [lr, r7, lsl #2] - ldr r0, [lr, r0, lsl #2] - ldr r1, [lr, r1, lsl #2] - and r6, r12, r11, lsr #16 + eor r4, r4, lr, ror #8 + ubfx lr, r11, #8, #8 + eor r4, r4, r2, ror #16 + ubfx r2, r8, #0, #8 + ldr r5, [r0, r5, lsl #2] + ldr r7, [r0, r7, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] + ubfx r6, r11, #16, #8 eor r5, r5, r7, ror #24 lsr r7, r10, #24 - eor r5, r5, r0, ror #8 - and r0, r12, r8, lsr #8 - eor r5, r5, r1, ror #16 - and r1, r12, r9 - ldr r6, [lr, r6, lsl #2] - ldr r7, [lr, r7, lsl #2] - ldr r0, [lr, r0, lsl #2] - ldr r1, [lr, r1, lsl #2] - and r10, r12, r10 + eor r5, r5, lr, ror #8 + ubfx lr, r8, 
#8, #8 + eor r5, r5, r2, ror #16 + ubfx r2, r9, #0, #8 + ldr r6, [r0, r6, lsl #2] + ldr r7, [r0, r7, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] + ubfx r10, r10, #0, #8 eor r6, r6, r7, ror #24 - and r7, r12, r8, lsr #16 - eor r6, r6, r0, ror #8 - lsr r0, r11, #24 - eor r6, r6, r1, ror #16 - and r1, r12, r9, lsr #8 - ldr r10, [lr, r10, lsl #2] - ldr r0, [lr, r0, lsl #2] - ldr r7, [lr, r7, lsl #2] - ldr r1, [lr, r1, lsl #2] - eor r0, r0, r10, ror #24 + ubfx r7, r8, #16, #8 + eor r6, r6, lr, ror #8 + lsr lr, r11, #24 + eor r6, r6, r2, ror #16 + ubfx r2, r9, #8, #8 + ldr r10, [r0, r10, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r7, [r0, r7, lsl #2] + ldr r2, [r0, r2, lsl #2] + eor lr, lr, r10, ror #24 ldm r3!, {r8, r9, r10, r11} - eor r7, r7, r0, ror #24 - eor r7, r7, r1, ror #8 + eor r7, r7, lr, ror #24 + eor r7, r7, r2, ror #8 # XOR in Key Schedule eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - subs r2, r2, #1 + subs r1, r1, #1 bne L_AES_encrypt_block_nr + ubfx r8, r5, #16, #8 lsr r11, r4, #24 - and r1, r12, r7 - and r8, r12, r5, lsr #16 - and r0, r12, r6, lsr #8 - ldr r11, [lr, r11, lsl #2] - ldr r1, [lr, r1, lsl #2] - ldr r8, [lr, r8, lsl #2] - ldr r0, [lr, r0, lsl #2] - and r9, r12, r6, lsr #16 + ubfx lr, r6, #8, #8 + ubfx r2, r7, #0, #8 + ldr r8, [r0, r8, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] + ubfx r9, r6, #16, #8 eor r8, r8, r11, ror #24 lsr r11, r5, #24 - eor r8, r8, r0, ror #8 - and r0, r12, r7, lsr #8 - eor r8, r8, r1, ror #16 - and r1, r12, r4 - ldr r9, [lr, r9, lsl #2] - ldr r11, [lr, r11, lsl #2] - ldr r0, [lr, r0, lsl #2] - ldr r1, [lr, r1, lsl #2] - and r10, r12, r7, lsr #16 + eor r8, r8, lr, ror #8 + ubfx lr, r7, #8, #8 + eor r8, r8, r2, ror #16 + ubfx r2, r4, #0, #8 + ldr r9, [r0, r9, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] + ubfx r10, r7, #16, #8 eor r9, r9, r11, ror #24 lsr r11, r6, #24 - eor r9, r9, r0, ror #8 - and r0, r12, r4, lsr #8 - eor r9, r9, r1, ror #16 - and r1, r12, r5 - ldr r10, [lr, r10, lsl #2] - ldr r11, [lr, r11, lsl #2] - ldr r0, [lr, r0, lsl #2] - ldr r1, [lr, r1, lsl #2] - and r6, r12, r6 + eor r9, r9, lr, ror #8 + ubfx lr, r4, #8, #8 + eor r9, r9, r2, ror #16 + ubfx r2, r5, #0, #8 + ldr r10, [r0, r10, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] + ubfx r6, r6, #0, #8 eor r10, r10, r11, ror #24 - and r11, r12, r4, lsr #16 - eor r10, r10, r0, ror #8 - lsr r0, r7, #24 - eor r10, r10, r1, ror #16 - and r1, r12, r5, lsr #8 - ldr r6, [lr, r6, lsl #2] - ldr r0, [lr, r0, lsl #2] - ldr r11, [lr, r11, lsl #2] - ldr r1, [lr, r1, lsl #2] - eor r0, r0, r6, ror #24 + ubfx r11, r4, #16, #8 + eor r10, r10, lr, ror #8 + lsr lr, r7, #24 + eor r10, r10, r2, ror #16 + ubfx r2, r5, #8, #8 + ldr r6, [r0, r6, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr r2, [r0, r2, lsl #2] + eor lr, lr, r6, ror #24 ldm r3!, {r4, r5, r6, r7} - eor r11, r11, r0, ror #24 - eor r11, r11, r1, ror #8 + eor r11, r11, lr, ror #24 + eor r11, r11, r2, ror #8 # XOR in Key Schedule eor r8, r8, r4 eor r9, r9, r5 eor r10, r10, r6 eor r11, r11, r7 - and r7, r12, r10, lsr #8 - lsr r1, r8, #24 - and r4, r12, r11 - and r0, r12, r9, lsr #16 - ldrb r7, [lr, r7, lsl #2] - ldrb r1, [lr, r1, lsl #2] - ldrb r4, [lr, r4, lsl #2] - ldrb r0, [lr, r0, lsl #2] - and r5, r12, r8 + ubfx r4, r11, #0, #8 + ubfx r7, r10, #8, #8 + ubfx lr, r9, #16, #8 + lsr r2, r8, #24 + ldrb r4, [r0, r4, lsl #2] + ldrb r7, [r0, r7, lsl 
#2] + ldrb lr, [r0, lr, lsl #2] + ldrb r2, [r0, r2, lsl #2] + ubfx r5, r8, #0, #8 eor r4, r4, r7, lsl #8 - and r7, r12, r11, lsr #8 - eor r4, r4, r0, lsl #16 - and r0, r12, r10, lsr #16 - eor r4, r4, r1, lsl #24 - lsr r1, r9, #24 - ldrb r7, [lr, r7, lsl #2] - ldrb r1, [lr, r1, lsl #2] - ldrb r5, [lr, r5, lsl #2] - ldrb r0, [lr, r0, lsl #2] - and r6, r12, r9 + ubfx r7, r11, #8, #8 + eor r4, r4, lr, lsl #16 + ubfx lr, r10, #16, #8 + eor r4, r4, r2, lsl #24 + lsr r2, r9, #24 + ldrb r5, [r0, r5, lsl #2] + ldrb r7, [r0, r7, lsl #2] + ldrb lr, [r0, lr, lsl #2] + ldrb r2, [r0, r2, lsl #2] + ubfx r6, r9, #0, #8 eor r5, r5, r7, lsl #8 - and r7, r12, r8, lsr #8 - eor r5, r5, r0, lsl #16 - and r0, r12, r11, lsr #16 - eor r5, r5, r1, lsl #24 - lsr r1, r10, #24 - ldrb r7, [lr, r7, lsl #2] - ldrb r1, [lr, r1, lsl #2] - ldrb r6, [lr, r6, lsl #2] - ldrb r0, [lr, r0, lsl #2] + ubfx r7, r8, #8, #8 + eor r5, r5, lr, lsl #16 + ubfx lr, r11, #16, #8 + eor r5, r5, r2, lsl #24 + lsr r2, r10, #24 + ldrb r6, [r0, r6, lsl #2] + ldrb r7, [r0, r7, lsl #2] + ldrb lr, [r0, lr, lsl #2] + ldrb r2, [r0, r2, lsl #2] lsr r11, r11, #24 eor r6, r6, r7, lsl #8 - and r7, r12, r10 - eor r6, r6, r0, lsl #16 - and r0, r12, r9, lsr #8 - eor r6, r6, r1, lsl #24 - and r1, r12, r8, lsr #16 - ldrb r11, [lr, r11, lsl #2] - ldrb r0, [lr, r0, lsl #2] - ldrb r7, [lr, r7, lsl #2] - ldrb r1, [lr, r1, lsl #2] - eor r0, r0, r11, lsl #16 + ubfx r7, r10, #0, #8 + eor r6, r6, lr, lsl #16 + ubfx lr, r9, #8, #8 + eor r6, r6, r2, lsl #24 + ubfx r2, r8, #16, #8 + ldrb r11, [r0, r11, lsl #2] + ldrb r7, [r0, r7, lsl #2] + ldrb lr, [r0, lr, lsl #2] + ldrb r2, [r0, r2, lsl #2] + eor lr, lr, r11, lsl #16 ldm r3, {r8, r9, r10, r11} - eor r7, r7, r0, lsl #8 - eor r7, r7, r1, lsl #16 + eor r7, r7, lr, lsl #8 + eor r7, r7, r2, lsl #16 # XOR in Key Schedule eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - str lr, [sp, #12] - pop {r2, lr} - bx lr - bx lr + pop {pc} .size AES_encrypt_block,.-AES_encrypt_block + .text + .type L_AES_ARM32_te_ecb, %object + .size L_AES_ARM32_te_ecb, 12 + .align 4 +L_AES_ARM32_te_ecb: + .word L_AES_ARM32_te_data #if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) .text .align 4 @@ -1150,35 +1136,34 @@ L_AES_encrypt_block_nr: .type AES_ECB_encrypt, %function AES_ECB_encrypt: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} - adr r4, AES_ECB_encrypt - mov lr, #AES_ECB_encrypt-L_AES_ARM32_te - sub lr, r4, lr + mov lr, r0 + ldr r0, L_AES_ARM32_te_ecb ldr r12, [sp, #36] + push {r3} cmp r12, #10 beq L_AES_ECB_encrypt_start_block_128 cmp r12, #12 beq L_AES_ECB_encrypt_start_block_192 - mov r12, #0xff L_AES_ECB_encrypt_loop_block_256: - ldr r4, [r0] - ldr r5, [r0, #4] - ldr r6, [r0, #8] - ldr r7, [r0, #12] + ldr r4, [lr] + ldr r5, [lr, #4] + ldr r6, [lr, #8] + ldr r7, [lr, #12] rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 - push {r0, r1} - push {r3, lr} + push {r1, r2, lr} ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - bl L_AES_encrypt_block_14 - pop {r3, lr} - pop {r0, r1} + mov r1, #6 + bl AES_encrypt_block + pop {r1, r2, lr} + ldr r3, [sp] rev r4, r4 rev r5, r5 rev r6, r6 @@ -1188,32 +1173,31 @@ L_AES_ECB_encrypt_loop_block_256: str r6, [r1, #8] str r7, [r1, #12] subs r2, r2, #16 - add r0, r0, #16 + add lr, lr, #16 add r1, r1, #16 bne L_AES_ECB_encrypt_loop_block_256 b L_AES_ECB_encrypt_end L_AES_ECB_encrypt_start_block_192: - mov r12, #0xff L_AES_ECB_encrypt_loop_block_192: - ldr r4, [r0] - 
ldr r5, [r0, #4] - ldr r6, [r0, #8] - ldr r7, [r0, #12] + ldr r4, [lr] + ldr r5, [lr, #4] + ldr r6, [lr, #8] + ldr r7, [lr, #12] rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 - push {r0, r1} - push {r3, lr} + push {r1, r2, lr} ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - bl L_AES_encrypt_block_12 - pop {r3, lr} - pop {r0, r1} + mov r1, #5 + bl AES_encrypt_block + pop {r1, r2, lr} + ldr r3, [sp] rev r4, r4 rev r5, r5 rev r6, r6 @@ -1223,32 +1207,31 @@ L_AES_ECB_encrypt_loop_block_192: str r6, [r1, #8] str r7, [r1, #12] subs r2, r2, #16 - add r0, r0, #16 + add lr, lr, #16 add r1, r1, #16 bne L_AES_ECB_encrypt_loop_block_192 b L_AES_ECB_encrypt_end L_AES_ECB_encrypt_start_block_128: - mov r12, #0xff L_AES_ECB_encrypt_loop_block_128: - ldr r4, [r0] - ldr r5, [r0, #4] - ldr r6, [r0, #8] - ldr r7, [r0, #12] + ldr r4, [lr] + ldr r5, [lr, #4] + ldr r6, [lr, #8] + ldr r7, [lr, #12] rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 - push {r0, r1} - push {r3, lr} + push {r1, r2, lr} ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - bl L_AES_encrypt_block_10 - pop {r3, lr} - pop {r0, r1} + mov r1, #4 + bl AES_encrypt_block + pop {r1, r2, lr} + ldr r3, [sp] rev r4, r4 rev r5, r5 rev r6, r6 @@ -1258,10 +1241,11 @@ L_AES_ECB_encrypt_loop_block_128: str r6, [r1, #8] str r7, [r1, #12] subs r2, r2, #16 - add r0, r0, #16 + add lr, lr, #16 add r1, r1, #16 bne L_AES_ECB_encrypt_loop_block_128 L_AES_ECB_encrypt_end: + pop {r3} pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size AES_ECB_encrypt,.-AES_ECB_encrypt #endif /* HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ @@ -1272,29 +1256,26 @@ L_AES_ECB_encrypt_end: .type AES_CBC_encrypt, %function AES_CBC_encrypt: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} - ldr r12, [sp, #36] - ldr lr, [sp, #40] - ldm lr, {r4, r5, r6, r7} - push {lr} - adr r8, AES_CBC_encrypt - mov lr, #AES_CBC_encrypt-L_AES_ARM32_te - sub lr, r8, lr - cmp r12, #10 + ldr r8, [sp, #36] + ldr r9, [sp, #40] + mov lr, r0 + ldr r0, L_AES_ARM32_te_ecb + ldm r9, {r4, r5, r6, r7} + push {r3, r9} + cmp r8, #10 beq L_AES_CBC_encrypt_start_block_128 - cmp r12, #12 + cmp r8, #12 beq L_AES_CBC_encrypt_start_block_192 - mov r12, #0xff L_AES_CBC_encrypt_loop_block_256: - ldr r8, [r0] - ldr r9, [r0, #4] - ldr r10, [r0, #8] - ldr r11, [r0, #12] + ldr r8, [lr] + ldr r9, [lr, #4] + ldr r10, [lr, #8] + ldr r11, [lr, #12] eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - push {r0, r1} - push {r3, lr} + push {r1, r2, lr} ldm r3!, {r8, r9, r10, r11} rev r4, r4 rev r5, r5 @@ -1305,9 +1286,10 @@ L_AES_CBC_encrypt_loop_block_256: eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - bl L_AES_encrypt_block_14 - pop {r3, lr} - pop {r0, r1} + mov r1, #6 + bl AES_encrypt_block + pop {r1, r2, lr} + ldr r3, [sp] rev r4, r4 rev r5, r5 rev r6, r6 @@ -1317,23 +1299,21 @@ L_AES_CBC_encrypt_loop_block_256: str r6, [r1, #8] str r7, [r1, #12] subs r2, r2, #16 - add r0, r0, #16 + add lr, lr, #16 add r1, r1, #16 bne L_AES_CBC_encrypt_loop_block_256 b L_AES_CBC_encrypt_end L_AES_CBC_encrypt_start_block_192: - mov r12, #0xff L_AES_CBC_encrypt_loop_block_192: - ldr r8, [r0] - ldr r9, [r0, #4] - ldr r10, [r0, #8] - ldr r11, [r0, #12] + ldr r8, [lr] + ldr r9, [lr, #4] + ldr r10, [lr, #8] + ldr r11, [lr, #12] eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - push {r0, r1} - push {r3, lr} + push {r1, r2, lr} ldm r3!, {r8, r9, r10, r11} rev r4, r4 
rev r5, r5 @@ -1344,9 +1324,10 @@ L_AES_CBC_encrypt_loop_block_192: eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - bl L_AES_encrypt_block_12 - pop {r3, lr} - pop {r0, r1} + mov r1, #5 + bl AES_encrypt_block + pop {r1, r2, lr} + ldr r3, [sp] rev r4, r4 rev r5, r5 rev r6, r6 @@ -1356,23 +1337,21 @@ L_AES_CBC_encrypt_loop_block_192: str r6, [r1, #8] str r7, [r1, #12] subs r2, r2, #16 - add r0, r0, #16 + add lr, lr, #16 add r1, r1, #16 bne L_AES_CBC_encrypt_loop_block_192 b L_AES_CBC_encrypt_end L_AES_CBC_encrypt_start_block_128: - mov r12, #0xff L_AES_CBC_encrypt_loop_block_128: - ldr r8, [r0] - ldr r9, [r0, #4] - ldr r10, [r0, #8] - ldr r11, [r0, #12] + ldr r8, [lr] + ldr r9, [lr, #4] + ldr r10, [lr, #8] + ldr r11, [lr, #12] eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - push {r0, r1} - push {r3, lr} + push {r1, r2, lr} ldm r3!, {r8, r9, r10, r11} rev r4, r4 rev r5, r5 @@ -1383,9 +1362,10 @@ L_AES_CBC_encrypt_loop_block_128: eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - bl L_AES_encrypt_block_10 - pop {r3, lr} - pop {r0, r1} + mov r1, #4 + bl AES_encrypt_block + pop {r1, r2, lr} + ldr r3, [sp] rev r4, r4 rev r5, r5 rev r6, r6 @@ -1395,12 +1375,12 @@ L_AES_CBC_encrypt_loop_block_128: str r6, [r1, #8] str r7, [r1, #12] subs r2, r2, #16 - add r0, r0, #16 + add lr, lr, #16 add r1, r1, #16 bne L_AES_CBC_encrypt_loop_block_128 L_AES_CBC_encrypt_end: - pop {lr} - stm lr, {r4, r5, r6, r7} + pop {r3, r9} + stm r9, {r4, r5, r6, r7} pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size AES_CBC_encrypt,.-AES_CBC_encrypt #endif /* HAVE_AES_CBC */ @@ -1412,161 +1392,374 @@ L_AES_CBC_encrypt_end: AES_CTR_encrypt: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ldr r12, [sp, #36] - ldr lr, [sp, #40] - ldm lr, {r4, r5, r6, r7} + ldr r8, [sp, #40] + mov lr, r0 + ldr r0, L_AES_ARM32_te_ecb + ldm r8, {r4, r5, r6, r7} rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 - stm lr, {r4, r5, r6, r7} - push {lr} - adr r8, AES_CTR_encrypt - mov lr, #AES_CTR_encrypt-L_AES_ARM32_te - sub lr, r8, lr + stm r8, {r4, r5, r6, r7} + push {r3, r8} cmp r12, #10 beq L_AES_CTR_encrypt_start_block_128 cmp r12, #12 beq L_AES_CTR_encrypt_start_block_192 - mov r12, #0xff L_AES_CTR_encrypt_loop_block_256: - push {r0, r1} - ldr r0, [sp, #8] + push {r1, r2, lr} + ldr lr, [sp, #16] adds r11, r7, #1 adcs r10, r6, #0 adcs r9, r5, #0 adc r8, r4, #0 - stm r0, {r8, r9, r10, r11} - push {r3, lr} + stm lr, {r8, r9, r10, r11} ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - bl L_AES_encrypt_block_14 - pop {r3, lr} - pop {r0, r1} + mov r1, #6 + bl AES_encrypt_block + pop {r1, r2, lr} + ldr r3, [sp] rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 - ldr r8, [r0] - ldr r9, [r0, #4] - ldr r10, [r0, #8] - ldr r11, [r0, #12] + ldr r8, [lr] + ldr r9, [lr, #4] + ldr r10, [lr, #8] + ldr r11, [lr, #12] eor r4, r8 eor r5, r9 eor r6, r10 eor r7, r11 + ldr r8, [sp, #4] str r4, [r1] str r5, [r1, #4] str r6, [r1, #8] str r7, [r1, #12] - ldr r8, [sp] ldm r8, {r4, r5, r6, r7} subs r2, r2, #16 - add r0, r0, #16 + add lr, lr, #16 add r1, r1, #16 bne L_AES_CTR_encrypt_loop_block_256 b L_AES_CTR_encrypt_end L_AES_CTR_encrypt_start_block_192: - mov r12, #0xff L_AES_CTR_encrypt_loop_block_192: - push {r0, r1} - ldr r0, [sp, #8] + push {r1, r2, lr} + ldr lr, [sp, #16] adds r11, r7, #1 adcs r10, r6, #0 adcs r9, r5, #0 adc r8, r4, #0 - stm r0, {r8, r9, r10, r11} - push {r3, lr} + stm lr, {r8, r9, r10, r11} ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule eor r4, r4, r8 eor r5, r5, r9 
eor r6, r6, r10 eor r7, r7, r11 - bl L_AES_encrypt_block_12 - pop {r3, lr} - pop {r0, r1} + mov r1, #5 + bl AES_encrypt_block + pop {r1, r2, lr} + ldr r3, [sp] rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 - ldr r8, [r0] - ldr r9, [r0, #4] - ldr r10, [r0, #8] - ldr r11, [r0, #12] + ldr r8, [lr] + ldr r9, [lr, #4] + ldr r10, [lr, #8] + ldr r11, [lr, #12] eor r4, r8 eor r5, r9 eor r6, r10 eor r7, r11 + ldr r8, [sp, #4] str r4, [r1] str r5, [r1, #4] str r6, [r1, #8] str r7, [r1, #12] - ldr r8, [sp] ldm r8, {r4, r5, r6, r7} subs r2, r2, #16 - add r0, r0, #16 + add lr, lr, #16 add r1, r1, #16 bne L_AES_CTR_encrypt_loop_block_192 b L_AES_CTR_encrypt_end L_AES_CTR_encrypt_start_block_128: - mov r12, #0xff L_AES_CTR_encrypt_loop_block_128: - push {r0, r1} - ldr r0, [sp, #8] + push {r1, r2, lr} + ldr lr, [sp, #16] adds r11, r7, #1 adcs r10, r6, #0 adcs r9, r5, #0 adc r8, r4, #0 - stm r0, {r8, r9, r10, r11} - push {r3, lr} + stm lr, {r8, r9, r10, r11} ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - bl L_AES_encrypt_block_10 - pop {r3, lr} - pop {r0, r1} - rev r4, r4 - rev r5, r5 - rev r6, r6 - rev r7, r7 - ldr r8, [r0] - ldr r9, [r0, #4] - ldr r10, [r0, #8] - ldr r11, [r0, #12] - eor r4, r8 - eor r5, r9 - eor r6, r10 - eor r7, r11 - str r4, [r1] - str r5, [r1, #4] - str r6, [r1, #8] - str r7, [r1, #12] - ldr r8, [sp] - ldm r8, {r4, r5, r6, r7} - subs r2, r2, #16 - add r0, r0, #16 - add r1, r1, #16 - bne L_AES_CTR_encrypt_loop_block_128 -L_AES_CTR_encrypt_end: - pop {lr} - rev r4, r4 - rev r5, r5 - rev r6, r6 - rev r7, r7 - stm lr, {r4, r5, r6, r7} - pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} - .size AES_CTR_encrypt,.-AES_CTR_encrypt -#endif /* WOLFSSL_AES_COUNTER */ -#endif /* HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ -#ifdef HAVE_AES_DECRYPT -#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || defined(HAVE_AES_CBC) + mov r1, #4 + bl AES_encrypt_block + pop {r1, r2, lr} + ldr r3, [sp] + rev r4, r4 + rev r5, r5 + rev r6, r6 + rev r7, r7 + ldr r8, [lr] + ldr r9, [lr, #4] + ldr r10, [lr, #8] + ldr r11, [lr, #12] + eor r4, r8 + eor r5, r9 + eor r6, r10 + eor r7, r11 + ldr r8, [sp, #4] + str r4, [r1] + str r5, [r1, #4] + str r6, [r1, #8] + str r7, [r1, #12] + ldm r8, {r4, r5, r6, r7} + subs r2, r2, #16 + add lr, lr, #16 + add r1, r1, #16 + bne L_AES_CTR_encrypt_loop_block_128 +L_AES_CTR_encrypt_end: + pop {r3, r8} + rev r4, r4 + rev r5, r5 + rev r6, r6 + rev r7, r7 + stm r8, {r4, r5, r6, r7} + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size AES_CTR_encrypt,.-AES_CTR_encrypt +#endif /* WOLFSSL_AES_COUNTER */ +#endif /* HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ +#ifdef HAVE_AES_DECRYPT +#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || defined(HAVE_AES_CBC) + .text + .align 4 + .globl AES_decrypt_block + .type AES_decrypt_block, %function +AES_decrypt_block: + push {lr} +L_AES_decrypt_block_nr: + ubfx r8, r7, #16, #8 + lsr r11, r4, #24 + ubfx lr, r6, #8, #8 + ubfx r2, r5, #0, #8 + ldr r8, [r0, r8, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] + ubfx r9, r4, #16, #8 + eor r8, r8, r11, ror #24 + lsr r11, r5, #24 + eor r8, r8, lr, ror #8 + ubfx lr, r7, #8, #8 + eor r8, r8, r2, ror #16 + ubfx r2, r6, #0, #8 + ldr r9, [r0, r9, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] + ubfx r10, r5, #16, #8 + eor r9, r9, r11, ror #24 + lsr r11, r6, #24 + eor r9, 
r9, lr, ror #8 + ubfx lr, r4, #8, #8 + eor r9, r9, r2, ror #16 + ubfx r2, r7, #0, #8 + ldr r10, [r0, r10, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] + ubfx r4, r4, #0, #8 + eor r10, r10, r11, ror #24 + ubfx r11, r6, #16, #8 + eor r10, r10, lr, ror #8 + lsr lr, r7, #24 + eor r10, r10, r2, ror #16 + ubfx r2, r5, #8, #8 + ldr r4, [r0, r4, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr r2, [r0, r2, lsl #2] + eor lr, lr, r4, ror #24 + ldm r3!, {r4, r5, r6, r7} + eor r11, r11, r2, ror #8 + eor r11, r11, lr, ror #24 + # XOR in Key Schedule + eor r8, r8, r4 + eor r9, r9, r5 + eor r10, r10, r6 + eor r11, r11, r7 + ubfx r4, r11, #16, #8 + lsr r7, r8, #24 + ubfx lr, r10, #8, #8 + ubfx r2, r9, #0, #8 + ldr r4, [r0, r4, lsl #2] + ldr r7, [r0, r7, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] + ubfx r5, r8, #16, #8 + eor r4, r4, r7, ror #24 + lsr r7, r9, #24 + eor r4, r4, lr, ror #8 + ubfx lr, r11, #8, #8 + eor r4, r4, r2, ror #16 + ubfx r2, r10, #0, #8 + ldr r5, [r0, r5, lsl #2] + ldr r7, [r0, r7, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] + ubfx r6, r9, #16, #8 + eor r5, r5, r7, ror #24 + lsr r7, r10, #24 + eor r5, r5, lr, ror #8 + ubfx lr, r8, #8, #8 + eor r5, r5, r2, ror #16 + ubfx r2, r11, #0, #8 + ldr r6, [r0, r6, lsl #2] + ldr r7, [r0, r7, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] + ubfx r8, r8, #0, #8 + eor r6, r6, r7, ror #24 + ubfx r7, r10, #16, #8 + eor r6, r6, lr, ror #8 + lsr lr, r11, #24 + eor r6, r6, r2, ror #16 + ubfx r2, r9, #8, #8 + ldr r8, [r0, r8, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r7, [r0, r7, lsl #2] + ldr r2, [r0, r2, lsl #2] + eor lr, lr, r8, ror #24 + ldm r3!, {r8, r9, r10, r11} + eor r7, r7, r2, ror #8 + eor r7, r7, lr, ror #24 + # XOR in Key Schedule + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + subs r1, r1, #1 + bne L_AES_decrypt_block_nr + ubfx r8, r7, #16, #8 + lsr r11, r4, #24 + ubfx lr, r6, #8, #8 + ubfx r2, r5, #0, #8 + ldr r8, [r0, r8, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] + ubfx r9, r4, #16, #8 + eor r8, r8, r11, ror #24 + lsr r11, r5, #24 + eor r8, r8, lr, ror #8 + ubfx lr, r7, #8, #8 + eor r8, r8, r2, ror #16 + ubfx r2, r6, #0, #8 + ldr r9, [r0, r9, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] + ubfx r10, r5, #16, #8 + eor r9, r9, r11, ror #24 + lsr r11, r6, #24 + eor r9, r9, lr, ror #8 + ubfx lr, r4, #8, #8 + eor r9, r9, r2, ror #16 + ubfx r2, r7, #0, #8 + ldr r10, [r0, r10, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r2, [r0, r2, lsl #2] + ubfx r4, r4, #0, #8 + eor r10, r10, r11, ror #24 + ubfx r11, r6, #16, #8 + eor r10, r10, lr, ror #8 + lsr lr, r7, #24 + eor r10, r10, r2, ror #16 + ubfx r2, r5, #8, #8 + ldr r4, [r0, r4, lsl #2] + ldr lr, [r0, lr, lsl #2] + ldr r11, [r0, r11, lsl #2] + ldr r2, [r0, r2, lsl #2] + eor lr, lr, r4, ror #24 + ldm r3!, {r4, r5, r6, r7} + eor r11, r11, r2, ror #8 + eor r11, r11, lr, ror #24 + # XOR in Key Schedule + eor r8, r8, r4 + eor r9, r9, r5 + eor r10, r10, r6 + eor r11, r11, r7 + ubfx r4, r9, #0, #8 + ubfx r7, r10, #8, #8 + ubfx lr, r11, #16, #8 + lsr r2, r8, #24 + ldrb r4, [r12, r4] + ldrb r7, [r12, r7] + ldrb lr, [r12, lr] + ldrb r2, [r12, r2] + ubfx r5, r10, #0, #8 + eor r4, r4, r7, lsl #8 + ubfx r7, r11, #8, #8 + eor r4, r4, lr, lsl #16 + ubfx lr, r8, #16, #8 + eor r4, r4, r2, lsl #24 + lsr r2, r9, #24 + ldrb r7, [r12, r7] + ldrb r2, [r12, r2] + 
ldrb r5, [r12, r5] + ldrb lr, [r12, lr] + ubfx r6, r11, #0, #8 + eor r5, r5, r7, lsl #8 + ubfx r7, r8, #8, #8 + eor r5, r5, lr, lsl #16 + ubfx lr, r9, #16, #8 + eor r5, r5, r2, lsl #24 + lsr r2, r10, #24 + ldrb r7, [r12, r7] + ldrb r2, [r12, r2] + ldrb r6, [r12, r6] + ldrb lr, [r12, lr] + lsr r11, r11, #24 + eor r6, r6, r7, lsl #8 + ubfx r7, r8, #0, #8 + eor r6, r6, lr, lsl #16 + ubfx lr, r9, #8, #8 + eor r6, r6, r2, lsl #24 + ubfx r2, r10, #16, #8 + ldrb r11, [r12, r11] + ldrb lr, [r12, lr] + ldrb r7, [r12, r7] + ldrb r2, [r12, r2] + eor lr, lr, r11, lsl #16 + ldm r3, {r8, r9, r10, r11} + eor r7, r7, lr, lsl #8 + eor r7, r7, r2, lsl #16 + # XOR in Key Schedule + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + pop {pc} + .size AES_decrypt_block,.-AES_decrypt_block + .text + .type L_AES_ARM32_td_ecb, %object + .size L_AES_ARM32_td_ecb, 12 + .align 4 +L_AES_ARM32_td_ecb: + .word L_AES_ARM32_td_data .text .type L_AES_ARM32_td4, %object .size L_AES_ARM32_td4, 256 @@ -1828,234 +2021,6 @@ L_AES_ARM32_td4: .byte 0x21 .byte 0xc .byte 0x7d - .text - .align 4 - .globl AES_decrypt_block - .type AES_decrypt_block, %function -AES_decrypt_block: -L_AES_decrypt_block_14: - push {lr} - ldr lr, [sp, #8] - mov r12, #6 - b L_AES_decrypt_block_nr -L_AES_decrypt_block_12: - push {lr} - ldr lr, [sp, #8] - mov r12, #5 - b L_AES_decrypt_block_nr -L_AES_decrypt_block_10: - push {lr} - ldr lr, [sp, #8] - mov r12, #4 -L_AES_decrypt_block_nr: - lsr r11, r4, #24 - and r1, r2, r5 - and r8, r2, r7, lsr #16 - and r0, r2, r6, lsr #8 - ldr r11, [lr, r11, lsl #2] - ldr r1, [lr, r1, lsl #2] - ldr r8, [lr, r8, lsl #2] - ldr r0, [lr, r0, lsl #2] - and r9, r2, r4, lsr #16 - eor r8, r8, r11, ror #24 - lsr r11, r5, #24 - eor r8, r8, r0, ror #8 - and r0, r2, r7, lsr #8 - eor r8, r8, r1, ror #16 - and r1, r2, r6 - ldr r9, [lr, r9, lsl #2] - ldr r11, [lr, r11, lsl #2] - ldr r0, [lr, r0, lsl #2] - ldr r1, [lr, r1, lsl #2] - and r10, r2, r5, lsr #16 - eor r9, r9, r11, ror #24 - lsr r11, r6, #24 - eor r9, r9, r0, ror #8 - and r0, r2, r4, lsr #8 - eor r9, r9, r1, ror #16 - and r1, r2, r7 - ldr r10, [lr, r10, lsl #2] - ldr r11, [lr, r11, lsl #2] - ldr r0, [lr, r0, lsl #2] - ldr r1, [lr, r1, lsl #2] - and r4, r2, r4 - eor r10, r10, r11, ror #24 - and r11, r2, r6, lsr #16 - eor r10, r10, r0, ror #8 - lsr r0, r7, #24 - eor r10, r10, r1, ror #16 - and r1, r2, r5, lsr #8 - ldr r4, [lr, r4, lsl #2] - ldr r0, [lr, r0, lsl #2] - ldr r11, [lr, r11, lsl #2] - ldr r1, [lr, r1, lsl #2] - eor r0, r0, r4, ror #24 - ldm r3!, {r4, r5, r6, r7} - eor r11, r11, r1, ror #8 - eor r11, r11, r0, ror #24 - # XOR in Key Schedule - eor r8, r8, r4 - eor r9, r9, r5 - eor r10, r10, r6 - eor r11, r11, r7 - lsr r7, r8, #24 - and r1, r2, r9 - and r4, r2, r11, lsr #16 - and r0, r2, r10, lsr #8 - ldr r7, [lr, r7, lsl #2] - ldr r1, [lr, r1, lsl #2] - ldr r4, [lr, r4, lsl #2] - ldr r0, [lr, r0, lsl #2] - and r5, r2, r8, lsr #16 - eor r4, r4, r7, ror #24 - lsr r7, r9, #24 - eor r4, r4, r0, ror #8 - and r0, r2, r11, lsr #8 - eor r4, r4, r1, ror #16 - and r1, r2, r10 - ldr r5, [lr, r5, lsl #2] - ldr r7, [lr, r7, lsl #2] - ldr r0, [lr, r0, lsl #2] - ldr r1, [lr, r1, lsl #2] - and r6, r2, r9, lsr #16 - eor r5, r5, r7, ror #24 - lsr r7, r10, #24 - eor r5, r5, r0, ror #8 - and r0, r2, r8, lsr #8 - eor r5, r5, r1, ror #16 - and r1, r2, r11 - ldr r6, [lr, r6, lsl #2] - ldr r7, [lr, r7, lsl #2] - ldr r0, [lr, r0, lsl #2] - ldr r1, [lr, r1, lsl #2] - and r8, r2, r8 - eor r6, r6, r7, ror #24 - and r7, r2, r10, lsr #16 - eor r6, r6, r0, ror #8 - lsr r0, r11, #24 - 
eor r6, r6, r1, ror #16 - and r1, r2, r9, lsr #8 - ldr r8, [lr, r8, lsl #2] - ldr r0, [lr, r0, lsl #2] - ldr r7, [lr, r7, lsl #2] - ldr r1, [lr, r1, lsl #2] - eor r0, r0, r8, ror #24 - ldm r3!, {r8, r9, r10, r11} - eor r7, r7, r1, ror #8 - eor r7, r7, r0, ror #24 - # XOR in Key Schedule - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r10 - eor r7, r7, r11 - subs r12, r12, #1 - bne L_AES_decrypt_block_nr - lsr r11, r4, #24 - and r1, r2, r5 - and r8, r2, r7, lsr #16 - and r0, r2, r6, lsr #8 - ldr r11, [lr, r11, lsl #2] - ldr r1, [lr, r1, lsl #2] - ldr r8, [lr, r8, lsl #2] - ldr r0, [lr, r0, lsl #2] - and r9, r2, r4, lsr #16 - eor r8, r8, r11, ror #24 - lsr r11, r5, #24 - eor r8, r8, r0, ror #8 - and r0, r2, r7, lsr #8 - eor r8, r8, r1, ror #16 - and r1, r2, r6 - ldr r9, [lr, r9, lsl #2] - ldr r11, [lr, r11, lsl #2] - ldr r0, [lr, r0, lsl #2] - ldr r1, [lr, r1, lsl #2] - and r10, r2, r5, lsr #16 - eor r9, r9, r11, ror #24 - lsr r11, r6, #24 - eor r9, r9, r0, ror #8 - and r0, r2, r4, lsr #8 - eor r9, r9, r1, ror #16 - and r1, r2, r7 - ldr r10, [lr, r10, lsl #2] - ldr r11, [lr, r11, lsl #2] - ldr r0, [lr, r0, lsl #2] - ldr r1, [lr, r1, lsl #2] - and r4, r2, r4 - eor r10, r10, r11, ror #24 - and r11, r2, r6, lsr #16 - eor r10, r10, r0, ror #8 - lsr r0, r7, #24 - eor r10, r10, r1, ror #16 - and r1, r2, r5, lsr #8 - ldr r4, [lr, r4, lsl #2] - ldr r0, [lr, r0, lsl #2] - ldr r11, [lr, r11, lsl #2] - ldr r1, [lr, r1, lsl #2] - eor r0, r0, r4, ror #24 - ldm r3!, {r4, r5, r6, r7} - eor r11, r11, r1, ror #8 - eor r11, r11, r0, ror #24 - # XOR in Key Schedule - eor r8, r8, r4 - eor r9, r9, r5 - eor r10, r10, r6 - eor r11, r11, r7 - adr r12, L_AES_ARM32_td4 - and r7, r2, r10, lsr #8 - lsr r1, r8, #24 - and r4, r2, r9 - and r0, r2, r11, lsr #16 - ldrb r7, [r12, r7] - ldrb r1, [r12, r1] - ldrb r4, [r12, r4] - ldrb r0, [r12, r0] - and r5, r2, r10 - eor r4, r4, r7, lsl #8 - and r7, r2, r11, lsr #8 - eor r4, r4, r0, lsl #16 - and r0, r2, r8, lsr #16 - eor r4, r4, r1, lsl #24 - lsr r1, r9, #24 - ldrb r7, [r12, r7] - ldrb r1, [r12, r1] - ldrb r5, [r12, r5] - ldrb r0, [r12, r0] - and r6, r2, r11 - eor r5, r5, r7, lsl #8 - and r7, r2, r8, lsr #8 - eor r5, r5, r0, lsl #16 - and r0, r2, r9, lsr #16 - eor r5, r5, r1, lsl #24 - lsr r1, r10, #24 - ldrb r7, [r12, r7] - ldrb r1, [r12, r1] - ldrb r6, [r12, r6] - ldrb r0, [r12, r0] - lsr r11, r11, #24 - eor r6, r6, r7, lsl #8 - and r7, r2, r8 - eor r6, r6, r0, lsl #16 - and r0, r2, r9, lsr #8 - eor r6, r6, r1, lsl #24 - and r1, r2, r10, lsr #16 - ldrb r11, [r12, r11] - ldrb r0, [r12, r0] - ldrb r7, [r12, r7] - ldrb r1, [r12, r1] - eor r0, r0, r11, lsl #16 - ldm r3, {r8, r9, r10, r11} - eor r7, r7, r0, lsl #8 - eor r7, r7, r1, lsl #16 - # XOR in Key Schedule - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r10 - eor r7, r7, r11 - str lr, [sp, #8] - pop {lr} - bx lr - bx lr - .size AES_decrypt_block,.-AES_decrypt_block #if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) .text .align 4 @@ -2063,36 +2028,33 @@ L_AES_decrypt_block_nr: .type AES_ECB_decrypt, %function AES_ECB_decrypt: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} - ldr r12, [sp, #36] - mov r8, r12 - adr r4, AES_ECB_decrypt - mov lr, #AES_ECB_decrypt-L_AES_ARM32_td - sub lr, r4, lr + ldr r8, [sp, #36] + mov lr, r0 + ldr r0, L_AES_ARM32_td_ecb + adr r12, L_AES_ARM32_td4 cmp r8, #10 beq L_AES_ECB_decrypt_start_block_128 cmp r8, #12 beq L_AES_ECB_decrypt_start_block_192 L_AES_ECB_decrypt_loop_block_256: - ldr r4, [r0] - ldr r5, [r0, #4] - ldr r6, [r0, #8] - ldr r7, [r0, #12] + ldr r4, [lr] + ldr r5, [lr, 
#4] + ldr r6, [lr, #8] + ldr r7, [lr, #12] rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 - push {r0, r1, r2} - mov r2, #0xff - push {r3, lr} + push {r1, r2, r3, lr} ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - bl L_AES_decrypt_block_14 - pop {r3, lr} - pop {r0, r1, r2} + mov r1, #6 + bl AES_decrypt_block + pop {r1, r2, r3, lr} rev r4, r4 rev r5, r5 rev r6, r6 @@ -2102,32 +2064,30 @@ L_AES_ECB_decrypt_loop_block_256: str r6, [r1, #8] str r7, [r1, #12] subs r2, r2, #16 - add r0, r0, #16 + add lr, lr, #16 add r1, r1, #16 bne L_AES_ECB_decrypt_loop_block_256 b L_AES_ECB_decrypt_end L_AES_ECB_decrypt_start_block_192: L_AES_ECB_decrypt_loop_block_192: - ldr r4, [r0] - ldr r5, [r0, #4] - ldr r6, [r0, #8] - ldr r7, [r0, #12] + ldr r4, [lr] + ldr r5, [lr, #4] + ldr r6, [lr, #8] + ldr r7, [lr, #12] rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 - push {r0, r1, r2} - mov r2, #0xff - push {r3, lr} + push {r1, r2, r3, lr} ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - bl L_AES_decrypt_block_12 - pop {r3, lr} - pop {r0, r1, r2} + mov r1, #5 + bl AES_decrypt_block + pop {r1, r2, r3, lr} rev r4, r4 rev r5, r5 rev r6, r6 @@ -2137,32 +2097,30 @@ L_AES_ECB_decrypt_loop_block_192: str r6, [r1, #8] str r7, [r1, #12] subs r2, r2, #16 - add r0, r0, #16 + add lr, lr, #16 add r1, r1, #16 bne L_AES_ECB_decrypt_loop_block_192 b L_AES_ECB_decrypt_end L_AES_ECB_decrypt_start_block_128: L_AES_ECB_decrypt_loop_block_128: - ldr r4, [r0] - ldr r5, [r0, #4] - ldr r6, [r0, #8] - ldr r7, [r0, #12] + ldr r4, [lr] + ldr r5, [lr, #4] + ldr r6, [lr, #8] + ldr r7, [lr, #12] rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 - push {r0, r1, r2} - mov r2, #0xff - push {r3, lr} + push {r1, r2, r3, lr} ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - bl L_AES_decrypt_block_10 - pop {r3, lr} - pop {r0, r1, r2} + mov r1, #4 + bl AES_decrypt_block + pop {r1, r2, r3, lr} rev r4, r4 rev r5, r5 rev r6, r6 @@ -2172,7 +2130,7 @@ L_AES_ECB_decrypt_loop_block_128: str r6, [r1, #8] str r7, [r1, #12] subs r2, r2, #16 - add r0, r0, #16 + add lr, lr, #16 add r1, r1, #16 bne L_AES_ECB_decrypt_loop_block_128 L_AES_ECB_decrypt_end: @@ -2186,46 +2144,35 @@ L_AES_ECB_decrypt_end: .type AES_CBC_decrypt, %function AES_CBC_decrypt: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} - sub sp, sp, #24 - ldr r12, [sp, #60] - ldr lr, [sp, #64] - str lr, [sp, #20] - str r3, [sp] - adr r8, AES_CBC_decrypt - mov lr, #AES_CBC_decrypt-L_AES_ARM32_td - sub lr, r8, lr - mov r8, r12 - str lr, [sp, #4] + ldr r8, [sp, #36] + ldr r4, [sp, #40] + mov lr, r0 + ldr r0, L_AES_ARM32_td_ecb + adr r12, L_AES_ARM32_td4 + push {r3, r4} cmp r8, #10 beq L_AES_CBC_decrypt_loop_block_128 cmp r8, #12 beq L_AES_CBC_decrypt_loop_block_192 L_AES_CBC_decrypt_loop_block_256: + push {r1, r2, lr} + ldr r4, [lr] + ldr r5, [lr, #4] + ldr r6, [lr, #8] + ldr r7, [lr, #12] + ldr lr, [sp, #16] #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r0, [sp, #8] - str r1, [sp, #12] -#else - strd r0, r1, [sp, #8] -#endif - str r2, [sp, #16] - ldr r4, [r0] - ldr r5, [r0, #4] - ldr r6, [r0, #8] - ldr r7, [r0, #12] - ldr r0, [sp, #20] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #16] - str r5, [r0, #20] + str r4, [lr, #16] + str r5, [lr, #20] #else - strd r4, r5, [r0, #16] + strd r4, r5, [lr, #16] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #24] - str r7, [r0, #28] + str r6, [lr, #24] + str r7, [lr, #28] #else - strd r6, r7, [r0, #24] + strd r6, r7, [lr, #24] #endif - mov r2, #0xff ldm r3!, {r8, r9, r10, r11} rev r4, r4 rev r5, r5 @@ -2236,21 +2183,16 @@ L_AES_CBC_decrypt_loop_block_256: eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - bl L_AES_decrypt_block_14 + mov r1, #6 + bl AES_decrypt_block + ldr lr, [sp, #16] rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 - ldr r0, [sp, #20] - ldm r0, {r8, r9, r10, r11} + ldm lr, {r8, r9, r10, r11} + pop {r1, r2, lr} ldr r3, [sp] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r0, [sp, #8] - ldr r1, [sp, #12] -#else - ldrd r0, r1, [sp, #8] -#endif - ldr r2, [sp, #16] eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 @@ -2260,34 +2202,27 @@ L_AES_CBC_decrypt_loop_block_256: str r6, [r1, #8] str r7, [r1, #12] subs r2, r2, #16 - add r0, r0, #16 + add lr, lr, #16 add r1, r1, #16 beq L_AES_CBC_decrypt_end_odd + push {r1, r2, lr} + ldr r4, [lr] + ldr r5, [lr, #4] + ldr r6, [lr, #8] + ldr r7, [lr, #12] + ldr lr, [sp, #16] #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r0, [sp, #8] - str r1, [sp, #12] -#else - strd r0, r1, [sp, #8] -#endif - str r2, [sp, #16] - ldr r4, [r0] - ldr r5, [r0, #4] - ldr r6, [r0, #8] - ldr r7, [r0, #12] - ldr r0, [sp, #20] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] + str r4, [lr] + str r5, [lr, #4] #else - strd r4, r5, [r0] + strd r4, r5, [lr] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] + str r6, [lr, #8] + str r7, [lr, #12] #else - strd r6, r7, [r0, #8] + strd r6, r7, [lr, #8] #endif - mov r2, #0xff ldm r3!, {r8, r9, r10, r11} rev r4, r4 rev r5, r5 @@ -2298,32 +2233,27 @@ L_AES_CBC_decrypt_loop_block_256: eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - bl L_AES_decrypt_block_14 + mov r1, #6 + bl AES_decrypt_block + ldr lr, [sp, #16] rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 - ldr r0, [sp, #20] #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r0, #16] - ldr r9, [r0, #20] + ldr r8, [lr, #16] + ldr r9, [lr, #20] #else - ldrd r8, r9, [r0, #16] + ldrd r8, r9, [lr, #16] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r0, #24] - ldr r11, [r0, #28] + ldr r10, [lr, #24] + ldr r11, [lr, #28] #else - ldrd r10, r11, [r0, #24] + ldrd r10, r11, [lr, #24] #endif + pop {r1, r2, lr} ldr r3, [sp] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r0, [sp, #8] - ldr r1, [sp, #12] -#else - ldrd r0, r1, [sp, #8] -#endif - ldr r2, [sp, #16] eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 @@ -2333,36 +2263,29 @@ L_AES_CBC_decrypt_loop_block_256: str r6, [r1, #8] str r7, [r1, #12] subs r2, r2, #16 - add r0, r0, #16 + add lr, lr, #16 add r1, r1, #16 bne L_AES_CBC_decrypt_loop_block_256 b L_AES_CBC_decrypt_end L_AES_CBC_decrypt_loop_block_192: + push {r1, r2, lr} + ldr r4, [lr] + ldr r5, [lr, #4] + ldr r6, [lr, #8] + ldr r7, [lr, #12] + ldr lr, [sp, #16] #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r0, [sp, #8] - str r1, [sp, #12] + str r4, [lr, #16] + str r5, [lr, #20] #else - strd r0, r1, [sp, #8] + strd r4, r5, [lr, #16] #endif - str r2, [sp, #16] - ldr r4, [r0] - ldr r5, [r0, #4] - ldr r6, [r0, #8] - ldr r7, [r0, #12] - ldr r0, [sp, #20] #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #16] - str r5, [r0, #20] + str r6, [lr, #24] + str r7, [lr, #28] #else - strd r4, r5, 
[r0, #16] + strd r6, r7, [lr, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #24] - str r7, [r0, #28] -#else - strd r6, r7, [r0, #24] -#endif - mov r2, #0xff ldm r3!, {r8, r9, r10, r11} rev r4, r4 rev r5, r5 @@ -2373,21 +2296,16 @@ L_AES_CBC_decrypt_loop_block_192: eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - bl L_AES_decrypt_block_12 + mov r1, #5 + bl AES_decrypt_block + ldr lr, [sp, #16] rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 - ldr r0, [sp, #20] - ldm r0, {r8, r9, r10, r11} + ldm lr, {r8, r9, r10, r11} + pop {r1, r2, lr} ldr r3, [sp] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r0, [sp, #8] - ldr r1, [sp, #12] -#else - ldrd r0, r1, [sp, #8] -#endif - ldr r2, [sp, #16] eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 @@ -2397,34 +2315,27 @@ L_AES_CBC_decrypt_loop_block_192: str r6, [r1, #8] str r7, [r1, #12] subs r2, r2, #16 - add r0, r0, #16 + add lr, lr, #16 add r1, r1, #16 beq L_AES_CBC_decrypt_end_odd + push {r1, r2, lr} + ldr r4, [lr] + ldr r5, [lr, #4] + ldr r6, [lr, #8] + ldr r7, [lr, #12] + ldr lr, [sp, #16] #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r0, [sp, #8] - str r1, [sp, #12] -#else - strd r0, r1, [sp, #8] -#endif - str r2, [sp, #16] - ldr r4, [r0] - ldr r5, [r0, #4] - ldr r6, [r0, #8] - ldr r7, [r0, #12] - ldr r0, [sp, #20] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] + str r4, [lr] + str r5, [lr, #4] #else - strd r4, r5, [r0] + strd r4, r5, [lr] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] + str r6, [lr, #8] + str r7, [lr, #12] #else - strd r6, r7, [r0, #8] + strd r6, r7, [lr, #8] #endif - mov r2, #0xff ldm r3!, {r8, r9, r10, r11} rev r4, r4 rev r5, r5 @@ -2435,32 +2346,27 @@ L_AES_CBC_decrypt_loop_block_192: eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - bl L_AES_decrypt_block_12 + mov r1, #5 + bl AES_decrypt_block + ldr lr, [sp, #16] rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 - ldr r0, [sp, #20] #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r0, #16] - ldr r9, [r0, #20] + ldr r8, [lr, #16] + ldr r9, [lr, #20] #else - ldrd r8, r9, [r0, #16] + ldrd r8, r9, [lr, #16] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r0, #24] - ldr r11, [r0, #28] + ldr r10, [lr, #24] + ldr r11, [lr, #28] #else - ldrd r10, r11, [r0, #24] + ldrd r10, r11, [lr, #24] #endif + pop {r1, r2, lr} ldr r3, [sp] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r0, [sp, #8] - ldr r1, [sp, #12] -#else - ldrd r0, r1, [sp, #8] -#endif - ldr r2, [sp, #16] eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 @@ -2470,36 +2376,29 @@ L_AES_CBC_decrypt_loop_block_192: str r6, [r1, #8] str r7, [r1, #12] subs r2, r2, #16 - add r0, r0, #16 + add lr, lr, #16 add r1, r1, #16 bne L_AES_CBC_decrypt_loop_block_192 b L_AES_CBC_decrypt_end L_AES_CBC_decrypt_loop_block_128: + push {r1, r2, lr} + ldr r4, [lr] + ldr r5, [lr, #4] + ldr r6, [lr, #8] + ldr r7, [lr, #12] + ldr lr, [sp, #16] #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r0, [sp, #8] - str r1, [sp, #12] -#else - strd r0, r1, [sp, #8] -#endif - str r2, [sp, #16] - ldr r4, [r0] - ldr r5, [r0, #4] - ldr r6, [r0, #8] - ldr r7, [r0, #12] - ldr r0, [sp, #20] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #16] - str r5, [r0, #20] + str r4, [lr, #16] + str r5, [lr, #20] #else - strd r4, r5, [r0, #16] + strd r4, r5, [lr, #16] #endif #if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #24] - str r7, [r0, #28] + str r6, [lr, #24] + str r7, [lr, #28] #else - strd r6, r7, [r0, #24] + strd r6, r7, [lr, #24] #endif - mov r2, #0xff ldm r3!, {r8, r9, r10, r11} rev r4, r4 rev r5, r5 @@ -2510,21 +2409,16 @@ L_AES_CBC_decrypt_loop_block_128: eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - bl L_AES_decrypt_block_10 + mov r1, #4 + bl AES_decrypt_block + ldr lr, [sp, #16] rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 - ldr r0, [sp, #20] - ldm r0, {r8, r9, r10, r11} + ldm lr, {r8, r9, r10, r11} + pop {r1, r2, lr} ldr r3, [sp] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r0, [sp, #8] - ldr r1, [sp, #12] -#else - ldrd r0, r1, [sp, #8] -#endif - ldr r2, [sp, #16] eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 @@ -2534,34 +2428,27 @@ L_AES_CBC_decrypt_loop_block_128: str r6, [r1, #8] str r7, [r1, #12] subs r2, r2, #16 - add r0, r0, #16 + add lr, lr, #16 add r1, r1, #16 beq L_AES_CBC_decrypt_end_odd + push {r1, r2, lr} + ldr r4, [lr] + ldr r5, [lr, #4] + ldr r6, [lr, #8] + ldr r7, [lr, #12] + ldr lr, [sp, #16] #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r0, [sp, #8] - str r1, [sp, #12] -#else - strd r0, r1, [sp, #8] -#endif - str r2, [sp, #16] - ldr r4, [r0] - ldr r5, [r0, #4] - ldr r6, [r0, #8] - ldr r7, [r0, #12] - ldr r0, [sp, #20] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] + str r4, [lr] + str r5, [lr, #4] #else - strd r4, r5, [r0] + strd r4, r5, [lr] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] + str r6, [lr, #8] + str r7, [lr, #12] #else - strd r6, r7, [r0, #8] + strd r6, r7, [lr, #8] #endif - mov r2, #0xff ldm r3!, {r8, r9, r10, r11} rev r4, r4 rev r5, r5 @@ -2572,32 +2459,27 @@ L_AES_CBC_decrypt_loop_block_128: eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - bl L_AES_decrypt_block_10 + mov r1, #4 + bl AES_decrypt_block + ldr lr, [sp, #16] rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 - ldr r0, [sp, #20] #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r0, #16] - ldr r9, [r0, #20] + ldr r8, [lr, #16] + ldr r9, [lr, #20] #else - ldrd r8, r9, [r0, #16] + ldrd r8, r9, [lr, #16] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r0, #24] - ldr r11, [r0, #28] + ldr r10, [lr, #24] + ldr r11, [lr, #28] #else - ldrd r10, r11, [r0, #24] + ldrd r10, r11, [lr, #24] #endif + pop {r1, r2, lr} ldr r3, [sp] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r0, [sp, #8] - ldr r1, [sp, #12] -#else - ldrd r0, r1, [sp, #8] -#endif - ldr r2, [sp, #16] eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 @@ -2607,39 +2489,38 @@ L_AES_CBC_decrypt_loop_block_128: str r6, [r1, #8] str r7, [r1, #12] subs r2, r2, #16 - add r0, r0, #16 + add lr, lr, #16 add r1, r1, #16 bne L_AES_CBC_decrypt_loop_block_128 b L_AES_CBC_decrypt_end L_AES_CBC_decrypt_end_odd: - ldr lr, [sp, #20] + ldr r4, [sp, #4] #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [lr, #16] - ldr r9, [lr, #20] + ldr r8, [r4, #16] + ldr r9, [r4, #20] #else - ldrd r8, r9, [lr, #16] + ldrd r8, r9, [r4, #16] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [lr, #24] - ldr r11, [lr, #28] + ldr r10, [r4, #24] + ldr r11, [r4, #28] #else - ldrd r10, r11, [lr, #24] + ldrd r10, r11, [r4, #24] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [lr] - str r9, [lr, #4] + str r8, [r4] 
+ str r9, [r4, #4] #else - strd r8, r9, [lr] + strd r8, r9, [r4] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [lr, #8] - str r11, [lr, #12] + str r10, [r4, #8] + str r11, [r4, #12] #else - strd r10, r11, [lr, #8] + strd r10, r11, [r4, #8] #endif L_AES_CBC_decrypt_end: - ldr lr, [sp, #4] - add sp, sp, #24 + pop {r3, r4} pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size AES_CBC_decrypt,.-AES_CBC_decrypt #endif /* HAVE_AES_CBC */ @@ -3223,159 +3104,162 @@ L_GCM_gmult_len_start_block: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size GCM_gmult_len,.-GCM_gmult_len .text + .type L_AES_ARM32_te_gcm, %object + .size L_AES_ARM32_te_gcm, 12 + .align 4 +L_AES_ARM32_te_gcm: + .word L_AES_ARM32_te_data + .text .align 4 .globl AES_GCM_encrypt .type AES_GCM_encrypt, %function AES_GCM_encrypt: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ldr r12, [sp, #36] - ldr lr, [sp, #40] - ldm lr, {r4, r5, r6, r7} + ldr r8, [sp, #40] + mov lr, r0 + ldr r0, L_AES_ARM32_te_gcm + ldm r8, {r4, r5, r6, r7} rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 - stm lr, {r4, r5, r6, r7} - push {lr} - adr r8, AES_GCM_encrypt - mov lr, #AES_GCM_encrypt-L_AES_ARM32_te - sub lr, r8, lr + stm r8, {r4, r5, r6, r7} + push {r3, r8} cmp r12, #10 beq L_AES_GCM_encrypt_start_block_128 cmp r12, #12 beq L_AES_GCM_encrypt_start_block_192 - mov r12, #0xff L_AES_GCM_encrypt_loop_block_256: - push {r0, r1} - ldr r0, [sp, #8] + push {r1, r2, lr} + ldr lr, [sp, #16] add r7, r7, #1 - str r7, [r0, #12] - push {r3, lr} ldm r3!, {r8, r9, r10, r11} + str r7, [lr, #12] # Round: 0 - XOR in key schedule eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - bl L_AES_encrypt_block_14 - pop {r3, lr} - pop {r0, r1} + mov r1, #6 + bl AES_encrypt_block + pop {r1, r2, lr} + ldr r3, [sp] rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 - ldr r8, [r0] - ldr r9, [r0, #4] - ldr r10, [r0, #8] - ldr r11, [r0, #12] + ldr r8, [lr] + ldr r9, [lr, #4] + ldr r10, [lr, #8] + ldr r11, [lr, #12] eor r4, r8 eor r5, r9 eor r6, r10 eor r7, r11 + ldr r8, [sp, #4] str r4, [r1] str r5, [r1, #4] str r6, [r1, #8] str r7, [r1, #12] - ldr r8, [sp] ldm r8, {r4, r5, r6, r7} subs r2, r2, #16 - add r0, r0, #16 + add lr, lr, #16 add r1, r1, #16 bne L_AES_GCM_encrypt_loop_block_256 b L_AES_GCM_encrypt_end L_AES_GCM_encrypt_start_block_192: - mov r12, #0xff L_AES_GCM_encrypt_loop_block_192: - push {r0, r1} - ldr r0, [sp, #8] + push {r1, r2, lr} + ldr lr, [sp, #16] add r7, r7, #1 - str r7, [r0, #12] - push {r3, lr} ldm r3!, {r8, r9, r10, r11} + str r7, [lr, #12] # Round: 0 - XOR in key schedule eor r4, r4, r8 eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - bl L_AES_encrypt_block_12 - pop {r3, lr} - pop {r0, r1} + mov r1, #5 + bl AES_encrypt_block + pop {r1, r2, lr} + ldr r3, [sp] rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 - ldr r8, [r0] - ldr r9, [r0, #4] - ldr r10, [r0, #8] - ldr r11, [r0, #12] + ldr r8, [lr] + ldr r9, [lr, #4] + ldr r10, [lr, #8] + ldr r11, [lr, #12] eor r4, r8 eor r5, r9 eor r6, r10 eor r7, r11 + ldr r8, [sp, #4] str r4, [r1] str r5, [r1, #4] str r6, [r1, #8] str r7, [r1, #12] - ldr r8, [sp] ldm r8, {r4, r5, r6, r7} subs r2, r2, #16 - add r0, r0, #16 + add lr, lr, #16 add r1, r1, #16 bne L_AES_GCM_encrypt_loop_block_192 b L_AES_GCM_encrypt_end L_AES_GCM_encrypt_start_block_128: - mov r12, #0xff L_AES_GCM_encrypt_loop_block_128: - push {r0, r1} - ldr r0, [sp, #8] + push {r1, r2, lr} + ldr lr, [sp, #16] add r7, r7, #1 - str r7, [r0, #12] - push {r3, lr} ldm r3!, {r8, r9, r10, r11} + str r7, [lr, #12] # Round: 0 - XOR in key schedule eor r4, 
r4, r8 eor r5, r5, r9 eor r6, r6, r10 eor r7, r7, r11 - bl L_AES_encrypt_block_10 - pop {r3, lr} - pop {r0, r1} + mov r1, #4 + bl AES_encrypt_block + pop {r1, r2, lr} + ldr r3, [sp] rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 - ldr r8, [r0] - ldr r9, [r0, #4] - ldr r10, [r0, #8] - ldr r11, [r0, #12] + ldr r8, [lr] + ldr r9, [lr, #4] + ldr r10, [lr, #8] + ldr r11, [lr, #12] eor r4, r8 eor r5, r9 eor r6, r10 eor r7, r11 + ldr r8, [sp, #4] str r4, [r1] str r5, [r1, #4] str r6, [r1, #8] str r7, [r1, #12] - ldr r8, [sp] ldm r8, {r4, r5, r6, r7} subs r2, r2, #16 - add r0, r0, #16 + add lr, lr, #16 add r1, r1, #16 bne L_AES_GCM_encrypt_loop_block_128 L_AES_GCM_encrypt_end: - pop {lr} + pop {r3, r8} rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 - stm lr, {r4, r5, r6, r7} + stm r8, {r4, r5, r6, r7} pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size AES_GCM_encrypt,.-AES_GCM_encrypt #endif /* HAVE_AESGCM */ #endif /* !NO_AES */ -#endif /* !__aarch64__ */ +#endif /* !__aarch64__ && !__thumb__ */ #endif /* WOLFSSL_ARMASM */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif +#endif /* !WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c new file mode 100644 index 0000000000..84e0ef62c2 --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c @@ -0,0 +1,2744 @@ +/* armv8-32-aes-asm + * + * Copyright (C) 2006-2023 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./aes/aes.rb arm32 ../wolfssl/wolfcrypt/src/port/arm/armv8-32-aes-asm.c + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__arm__) +#include +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include +#ifdef WOLFSSL_ARMASM_INLINE +#ifndef NO_AES +#include + +static const uint32_t L_AES_ARM32_td_data[] = { + 0x5051f4a7, 0x537e4165, 0xc31a17a4, 0x963a275e, + 0xcb3bab6b, 0xf11f9d45, 0xabacfa58, 0x934be303, + 0x552030fa, 0xf6ad766d, 0x9188cc76, 0x25f5024c, + 0xfc4fe5d7, 0xd7c52acb, 0x80263544, 0x8fb562a3, + 0x49deb15a, 0x6725ba1b, 0x9845ea0e, 0xe15dfec0, + 0x02c32f75, 0x12814cf0, 0xa38d4697, 0xc66bd3f9, + 0xe7038f5f, 0x9515929c, 0xebbf6d7a, 0xda955259, + 0x2dd4be83, 0xd3587421, 0x2949e069, 0x448ec9c8, + 0x6a75c289, 0x78f48e79, 0x6b99583e, 0xdd27b971, + 0xb6bee14f, 0x17f088ad, 0x66c920ac, 0xb47dce3a, + 0x1863df4a, 0x82e51a31, 0x60975133, 0x4562537f, + 0xe0b16477, 0x84bb6bae, 0x1cfe81a0, 0x94f9082b, + 0x58704868, 0x198f45fd, 0x8794de6c, 0xb7527bf8, + 0x23ab73d3, 0xe2724b02, 0x57e31f8f, 0x2a6655ab, + 0x07b2eb28, 0x032fb5c2, 0x9a86c57b, 0xa5d33708, + 0xf2302887, 0xb223bfa5, 0xba02036a, 0x5ced1682, + 0x2b8acf1c, 0x92a779b4, 0xf0f307f2, 0xa14e69e2, + 0xcd65daf4, 0xd50605be, 0x1fd13462, 0x8ac4a6fe, + 0x9d342e53, 0xa0a2f355, 0x32058ae1, 0x75a4f6eb, + 0x390b83ec, 0xaa4060ef, 0x065e719f, 0x51bd6e10, + 0xf93e218a, 0x3d96dd06, 0xaedd3e05, 0x464de6bd, + 0xb591548d, 0x0571c45d, 0x6f0406d4, 0xff605015, + 0x241998fb, 0x97d6bde9, 0xcc894043, 0x7767d99e, + 0xbdb0e842, 0x8807898b, 0x38e7195b, 0xdb79c8ee, + 0x47a17c0a, 0xe97c420f, 0xc9f8841e, 0x00000000, + 0x83098086, 0x48322bed, 0xac1e1170, 0x4e6c5a72, + 0xfbfd0eff, 0x560f8538, 0x1e3daed5, 0x27362d39, + 0x640a0fd9, 0x21685ca6, 0xd19b5b54, 0x3a24362e, + 0xb10c0a67, 0x0f9357e7, 0xd2b4ee96, 0x9e1b9b91, + 0x4f80c0c5, 0xa261dc20, 0x695a774b, 0x161c121a, + 0x0ae293ba, 0xe5c0a02a, 0x433c22e0, 0x1d121b17, + 0x0b0e090d, 0xadf28bc7, 0xb92db6a8, 0xc8141ea9, + 0x8557f119, 0x4caf7507, 0xbbee99dd, 0xfda37f60, + 0x9ff70126, 0xbc5c72f5, 0xc544663b, 0x345bfb7e, + 0x768b4329, 0xdccb23c6, 0x68b6edfc, 0x63b8e4f1, + 0xcad731dc, 0x10426385, 0x40139722, 0x2084c611, + 0x7d854a24, 0xf8d2bb3d, 0x11aef932, 0x6dc729a1, + 0x4b1d9e2f, 0xf3dcb230, 0xec0d8652, 0xd077c1e3, + 0x6c2bb316, 0x99a970b9, 0xfa119448, 0x2247e964, + 0xc4a8fc8c, 0x1aa0f03f, 0xd8567d2c, 0xef223390, + 0xc787494e, 0xc1d938d1, 0xfe8ccaa2, 0x3698d40b, + 0xcfa6f581, 0x28a57ade, 0x26dab78e, 0xa43fadbf, + 0xe42c3a9d, 0x0d507892, 0x9b6a5fcc, 0x62547e46, + 0xc2f68d13, 0xe890d8b8, 0x5e2e39f7, 0xf582c3af, + 0xbe9f5d80, 0x7c69d093, 0xa96fd52d, 0xb3cf2512, + 0x3bc8ac99, 0xa710187d, 0x6ee89c63, 0x7bdb3bbb, + 0x09cd2678, 0xf46e5918, 0x01ec9ab7, 0xa8834f9a, + 0x65e6956e, 0x7eaaffe6, 0x0821bccf, 0xe6ef15e8, + 0xd9bae79b, 0xce4a6f36, 0xd4ea9f09, 0xd629b07c, + 0xaf31a4b2, 0x312a3f23, 0x30c6a594, 0xc035a266, + 0x37744ebc, 0xa6fc82ca, 0xb0e090d0, 0x1533a7d8, + 0x4af10498, 0xf741ecda, 0x0e7fcd50, 0x2f1791f6, + 0x8d764dd6, 0x4d43efb0, 0x54ccaa4d, 0xdfe49604, + 0xe39ed1b5, 0x1b4c6a88, 0xb8c12c1f, 0x7f466551, + 0x049d5eea, 0x5d018c35, 0x73fa8774, 0x2efb0b41, + 0x5ab3671d, 0x5292dbd2, 0x33e91056, 0x136dd647, + 0x8c9ad761, 0x7a37a10c, 0x8e59f814, 0x89eb133c, + 
0xeecea927, 0x35b761c9, 0xede11ce5, 0x3c7a47b1, + 0x599cd2df, 0x3f55f273, 0x791814ce, 0xbf73c737, + 0xea53f7cd, 0x5b5ffdaa, 0x14df3d6f, 0x867844db, + 0x81caaff3, 0x3eb968c4, 0x2c382434, 0x5fc2a340, + 0x72161dc3, 0x0cbce225, 0x8b283c49, 0x41ff0d95, + 0x7139a801, 0xde080cb3, 0x9cd8b4e4, 0x906456c1, + 0x617bcb84, 0x70d532b6, 0x74486c5c, 0x42d0b857, +}; + +static const uint32_t L_AES_ARM32_te_data[] = { + 0xa5c66363, 0x84f87c7c, 0x99ee7777, 0x8df67b7b, + 0x0dfff2f2, 0xbdd66b6b, 0xb1de6f6f, 0x5491c5c5, + 0x50603030, 0x03020101, 0xa9ce6767, 0x7d562b2b, + 0x19e7fefe, 0x62b5d7d7, 0xe64dabab, 0x9aec7676, + 0x458fcaca, 0x9d1f8282, 0x4089c9c9, 0x87fa7d7d, + 0x15effafa, 0xebb25959, 0xc98e4747, 0x0bfbf0f0, + 0xec41adad, 0x67b3d4d4, 0xfd5fa2a2, 0xea45afaf, + 0xbf239c9c, 0xf753a4a4, 0x96e47272, 0x5b9bc0c0, + 0xc275b7b7, 0x1ce1fdfd, 0xae3d9393, 0x6a4c2626, + 0x5a6c3636, 0x417e3f3f, 0x02f5f7f7, 0x4f83cccc, + 0x5c683434, 0xf451a5a5, 0x34d1e5e5, 0x08f9f1f1, + 0x93e27171, 0x73abd8d8, 0x53623131, 0x3f2a1515, + 0x0c080404, 0x5295c7c7, 0x65462323, 0x5e9dc3c3, + 0x28301818, 0xa1379696, 0x0f0a0505, 0xb52f9a9a, + 0x090e0707, 0x36241212, 0x9b1b8080, 0x3ddfe2e2, + 0x26cdebeb, 0x694e2727, 0xcd7fb2b2, 0x9fea7575, + 0x1b120909, 0x9e1d8383, 0x74582c2c, 0x2e341a1a, + 0x2d361b1b, 0xb2dc6e6e, 0xeeb45a5a, 0xfb5ba0a0, + 0xf6a45252, 0x4d763b3b, 0x61b7d6d6, 0xce7db3b3, + 0x7b522929, 0x3edde3e3, 0x715e2f2f, 0x97138484, + 0xf5a65353, 0x68b9d1d1, 0x00000000, 0x2cc1eded, + 0x60402020, 0x1fe3fcfc, 0xc879b1b1, 0xedb65b5b, + 0xbed46a6a, 0x468dcbcb, 0xd967bebe, 0x4b723939, + 0xde944a4a, 0xd4984c4c, 0xe8b05858, 0x4a85cfcf, + 0x6bbbd0d0, 0x2ac5efef, 0xe54faaaa, 0x16edfbfb, + 0xc5864343, 0xd79a4d4d, 0x55663333, 0x94118585, + 0xcf8a4545, 0x10e9f9f9, 0x06040202, 0x81fe7f7f, + 0xf0a05050, 0x44783c3c, 0xba259f9f, 0xe34ba8a8, + 0xf3a25151, 0xfe5da3a3, 0xc0804040, 0x8a058f8f, + 0xad3f9292, 0xbc219d9d, 0x48703838, 0x04f1f5f5, + 0xdf63bcbc, 0xc177b6b6, 0x75afdada, 0x63422121, + 0x30201010, 0x1ae5ffff, 0x0efdf3f3, 0x6dbfd2d2, + 0x4c81cdcd, 0x14180c0c, 0x35261313, 0x2fc3ecec, + 0xe1be5f5f, 0xa2359797, 0xcc884444, 0x392e1717, + 0x5793c4c4, 0xf255a7a7, 0x82fc7e7e, 0x477a3d3d, + 0xacc86464, 0xe7ba5d5d, 0x2b321919, 0x95e67373, + 0xa0c06060, 0x98198181, 0xd19e4f4f, 0x7fa3dcdc, + 0x66442222, 0x7e542a2a, 0xab3b9090, 0x830b8888, + 0xca8c4646, 0x29c7eeee, 0xd36bb8b8, 0x3c281414, + 0x79a7dede, 0xe2bc5e5e, 0x1d160b0b, 0x76addbdb, + 0x3bdbe0e0, 0x56643232, 0x4e743a3a, 0x1e140a0a, + 0xdb924949, 0x0a0c0606, 0x6c482424, 0xe4b85c5c, + 0x5d9fc2c2, 0x6ebdd3d3, 0xef43acac, 0xa6c46262, + 0xa8399191, 0xa4319595, 0x37d3e4e4, 0x8bf27979, + 0x32d5e7e7, 0x438bc8c8, 0x596e3737, 0xb7da6d6d, + 0x8c018d8d, 0x64b1d5d5, 0xd29c4e4e, 0xe049a9a9, + 0xb4d86c6c, 0xfaac5656, 0x07f3f4f4, 0x25cfeaea, + 0xafca6565, 0x8ef47a7a, 0xe947aeae, 0x18100808, + 0xd56fbaba, 0x88f07878, 0x6f4a2525, 0x725c2e2e, + 0x24381c1c, 0xf157a6a6, 0xc773b4b4, 0x5197c6c6, + 0x23cbe8e8, 0x7ca1dddd, 0x9ce87474, 0x213e1f1f, + 0xdd964b4b, 0xdc61bdbd, 0x860d8b8b, 0x850f8a8a, + 0x90e07070, 0x427c3e3e, 0xc471b5b5, 0xaacc6666, + 0xd8904848, 0x05060303, 0x01f7f6f6, 0x121c0e0e, + 0xa3c26161, 0x5f6a3535, 0xf9ae5757, 0xd069b9b9, + 0x91178686, 0x5899c1c1, 0x273a1d1d, 0xb9279e9e, + 0x38d9e1e1, 0x13ebf8f8, 0xb32b9898, 0x33221111, + 0xbbd26969, 0x70a9d9d9, 0x89078e8e, 0xa7339494, + 0xb62d9b9b, 0x223c1e1e, 0x92158787, 0x20c9e9e9, + 0x4987cece, 0xffaa5555, 0x78502828, 0x7aa5dfdf, + 0x8f038c8c, 0xf859a1a1, 0x80098989, 0x171a0d0d, + 0xda65bfbf, 0x31d7e6e6, 0xc6844242, 0xb8d06868, + 0xc3824141, 0xb0299999, 0x775a2d2d, 0x111e0f0f, + 
0xcb7bb0b0, 0xfca85454, 0xd66dbbbb, 0x3a2c1616, +}; + +static const uint32_t* L_AES_ARM32_td = L_AES_ARM32_td_data; +static const uint32_t* L_AES_ARM32_te = L_AES_ARM32_te_data; +#ifdef HAVE_AES_DECRYPT +void AES_invert_key(unsigned char* ks, word32 rounds); +void AES_invert_key(unsigned char* ks_p, word32 rounds_p) +{ + register unsigned char* ks asm ("r0") = ks_p; + register word32 rounds asm ("r1") = rounds_p; + + __asm__ __volatile__ ( + "ldr r12, %[L_AES_ARM32_te]\n\t" + "ldr lr, %[L_AES_ARM32_td]\n\t" + "add r10, %[ks], %[rounds], lsl #4\n\t" + "mov r11, %[rounds]\n\t" + "\n" + "L_AES_invert_key_loop_%=: \n\t" + "ldm %[ks], {r2, r3, r4, r5}\n\t" + "ldm r10, {r6, r7, r8, r9}\n\t" + "stm r10, {r2, r3, r4, r5}\n\t" + "stm %[ks]!, {r6, r7, r8, r9}\n\t" + "subs r11, r11, #2\n\t" + "sub r10, r10, #16\n\t" + "bne L_AES_invert_key_loop_%=\n\t" + "sub %[ks], %[ks], %[rounds], lsl #3\n\t" + "add %[ks], %[ks], #16\n\t" + "sub r11, %[rounds], #1\n\t" + "\n" + "L_AES_invert_key_mix_loop_%=: \n\t" + "ldm %[ks], {r2, r3, r4, r5}\n\t" + "ubfx r6, r2, #0, #8\n\t" + "ubfx r7, r2, #8, #8\n\t" + "ubfx r8, r2, #16, #8\n\t" + "lsr r9, r2, #24\n\t" + "ldrb r6, [r12, r6, lsl #2]\n\t" + "ldrb r7, [r12, r7, lsl #2]\n\t" + "ldrb r8, [r12, r8, lsl #2]\n\t" + "ldrb r9, [r12, r9, lsl #2]\n\t" + "ldr r6, [lr, r6, lsl #2]\n\t" + "ldr r7, [lr, r7, lsl #2]\n\t" + "ldr r8, [lr, r8, lsl #2]\n\t" + "ldr r9, [lr, r9, lsl #2]\n\t" + "eor r8, r8, r6, ror #16\n\t" + "eor r8, r8, r7, ror #8\n\t" + "eor r8, r8, r9, ror #24\n\t" + "str r8, [%[ks]], #4\n\t" + "ubfx r6, r3, #0, #8\n\t" + "ubfx r7, r3, #8, #8\n\t" + "ubfx r8, r3, #16, #8\n\t" + "lsr r9, r3, #24\n\t" + "ldrb r6, [r12, r6, lsl #2]\n\t" + "ldrb r7, [r12, r7, lsl #2]\n\t" + "ldrb r8, [r12, r8, lsl #2]\n\t" + "ldrb r9, [r12, r9, lsl #2]\n\t" + "ldr r6, [lr, r6, lsl #2]\n\t" + "ldr r7, [lr, r7, lsl #2]\n\t" + "ldr r8, [lr, r8, lsl #2]\n\t" + "ldr r9, [lr, r9, lsl #2]\n\t" + "eor r8, r8, r6, ror #16\n\t" + "eor r8, r8, r7, ror #8\n\t" + "eor r8, r8, r9, ror #24\n\t" + "str r8, [%[ks]], #4\n\t" + "ubfx r6, r4, #0, #8\n\t" + "ubfx r7, r4, #8, #8\n\t" + "ubfx r8, r4, #16, #8\n\t" + "lsr r9, r4, #24\n\t" + "ldrb r6, [r12, r6, lsl #2]\n\t" + "ldrb r7, [r12, r7, lsl #2]\n\t" + "ldrb r8, [r12, r8, lsl #2]\n\t" + "ldrb r9, [r12, r9, lsl #2]\n\t" + "ldr r6, [lr, r6, lsl #2]\n\t" + "ldr r7, [lr, r7, lsl #2]\n\t" + "ldr r8, [lr, r8, lsl #2]\n\t" + "ldr r9, [lr, r9, lsl #2]\n\t" + "eor r8, r8, r6, ror #16\n\t" + "eor r8, r8, r7, ror #8\n\t" + "eor r8, r8, r9, ror #24\n\t" + "str r8, [%[ks]], #4\n\t" + "ubfx r6, r5, #0, #8\n\t" + "ubfx r7, r5, #8, #8\n\t" + "ubfx r8, r5, #16, #8\n\t" + "lsr r9, r5, #24\n\t" + "ldrb r6, [r12, r6, lsl #2]\n\t" + "ldrb r7, [r12, r7, lsl #2]\n\t" + "ldrb r8, [r12, r8, lsl #2]\n\t" + "ldrb r9, [r12, r9, lsl #2]\n\t" + "ldr r6, [lr, r6, lsl #2]\n\t" + "ldr r7, [lr, r7, lsl #2]\n\t" + "ldr r8, [lr, r8, lsl #2]\n\t" + "ldr r9, [lr, r9, lsl #2]\n\t" + "eor r8, r8, r6, ror #16\n\t" + "eor r8, r8, r7, ror #8\n\t" + "eor r8, r8, r9, ror #24\n\t" + "str r8, [%[ks]], #4\n\t" + "subs r11, r11, #1\n\t" + "bne L_AES_invert_key_mix_loop_%=\n\t" + : [ks] "+r" (ks), [rounds] "+r" (rounds) + : [L_AES_ARM32_te] "g" (L_AES_ARM32_te), [L_AES_ARM32_td] "g" (L_AES_ARM32_td) + : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +#endif /* HAVE_AES_DECRYPT */ +static const uint32_t L_AES_ARM32_rcon[] = { + 0x01000000, 0x02000000, 0x04000000, 0x08000000, + 0x10000000, 0x20000000, 0x40000000, 0x80000000, + 0x1b000000, 0x36000000, +}; + 
+void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks); +void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p, unsigned char* ks_p) +{ + register const unsigned char* key asm ("r0") = key_p; + register word32 len asm ("r1") = len_p; + register unsigned char* ks asm ("r2") = ks_p; + + __asm__ __volatile__ ( + "mov r8, %[L_AES_ARM32_te]\n\t" + "mov lr, %[L_AES_ARM32_rcon]\n\t" + "cmp %[len], #0x80\n\t" + "beq L_AES_set_encrypt_key_start_128_%=\n\t" + "cmp %[len], #0xc0\n\t" + "beq L_AES_set_encrypt_key_start_192_%=\n\t" +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r4, [%[key]]\n\t" + "ldr r5, [%[key], #4]\n\t" +#else + "ldrd r4, r5, [%[key]]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r6, [%[key], #8]\n\t" + "ldr r7, [%[key], #12]\n\t" +#else + "ldrd r6, r7, [%[key], #8]\n\t" +#endif + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "stm %[ks]!, {r4, r5, r6, r7}\n\t" +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r4, [%[key], #16]\n\t" + "ldr r5, [%[key], #20]\n\t" +#else + "ldrd r4, r5, [%[key], #16]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r6, [%[key], #24]\n\t" + "ldr r7, [%[key], #28]\n\t" +#else + "ldrd r6, r7, [%[key], #24]\n\t" +#endif + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "stm %[ks], {r4, r5, r6, r7}\n\t" + "sub %[ks], %[ks], #16\n\t" + "mov r12, #6\n\t" + "\n" + "L_AES_set_encrypt_key_loop_256_%=: \n\t" + "ubfx r4, r7, #0, #8\n\t" + "ubfx r5, r7, #8, #8\n\t" + "ubfx r6, r7, #16, #8\n\t" + "lsr r7, r7, #24\n\t" + "ldrb r4, [r8, r4, lsl #2]\n\t" + "ldrb r5, [r8, r5, lsl #2]\n\t" + "ldrb r6, [r8, r6, lsl #2]\n\t" + "ldrb r7, [r8, r7, lsl #2]\n\t" + "eor r3, r7, r4, lsl #8\n\t" + "eor r3, r3, r5, lsl #16\n\t" + "eor r3, r3, r6, lsl #24\n\t" + "ldm %[ks]!, {r4, r5, r6, r7}\n\t" + "eor r4, r4, r3\n\t" + "ldm lr!, {r3}\n\t" + "eor r4, r4, r3\n\t" + "eor r5, r5, r4\n\t" + "eor r6, r6, r5\n\t" + "eor r7, r7, r6\n\t" + "add %[ks], %[ks], #16\n\t" + "stm %[ks], {r4, r5, r6, r7}\n\t" + "sub %[ks], %[ks], #16\n\t" + "mov r3, r7\n\t" + "ubfx r4, r3, #8, #8\n\t" + "ubfx r5, r3, #16, #8\n\t" + "lsr r6, r3, #24\n\t" + "ubfx r3, r3, #0, #8\n\t" + "ldrb r4, [r8, r4, lsl #2]\n\t" + "ldrb r6, [r8, r6, lsl #2]\n\t" + "ldrb r5, [r8, r5, lsl #2]\n\t" + "ldrb r3, [r8, r3, lsl #2]\n\t" + "eor r3, r3, r4, lsl #8\n\t" + "eor r3, r3, r5, lsl #16\n\t" + "eor r3, r3, r6, lsl #24\n\t" + "ldm %[ks]!, {r4, r5, r6, r7}\n\t" + "eor r4, r4, r3\n\t" + "eor r5, r5, r4\n\t" + "eor r6, r6, r5\n\t" + "eor r7, r7, r6\n\t" + "add %[ks], %[ks], #16\n\t" + "stm %[ks], {r4, r5, r6, r7}\n\t" + "sub %[ks], %[ks], #16\n\t" + "subs r12, r12, #1\n\t" + "bne L_AES_set_encrypt_key_loop_256_%=\n\t" + "ubfx r4, r7, #0, #8\n\t" + "ubfx r5, r7, #8, #8\n\t" + "ubfx r6, r7, #16, #8\n\t" + "lsr r7, r7, #24\n\t" + "ldrb r4, [r8, r4, lsl #2]\n\t" + "ldrb r5, [r8, r5, lsl #2]\n\t" + "ldrb r6, [r8, r6, lsl #2]\n\t" + "ldrb r7, [r8, r7, lsl #2]\n\t" + "eor r3, r7, r4, lsl #8\n\t" + "eor r3, r3, r5, lsl #16\n\t" + "eor r3, r3, r6, lsl #24\n\t" + "ldm %[ks]!, {r4, r5, r6, r7}\n\t" + "eor r4, r4, r3\n\t" + "ldm lr!, {r3}\n\t" + "eor r4, r4, r3\n\t" + "eor r5, r5, r4\n\t" + "eor r6, r6, r5\n\t" + "eor r7, r7, r6\n\t" + "add %[ks], %[ks], #16\n\t" + "stm %[ks], {r4, r5, r6, r7}\n\t" + "sub %[ks], %[ks], #16\n\t" + "b L_AES_set_encrypt_key_end_%=\n\t" + "\n" + "L_AES_set_encrypt_key_start_192_%=: \n\t" +#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r4, [%[key]]\n\t" + "ldr r5, [%[key], #4]\n\t" +#else + "ldrd r4, r5, [%[key]]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r6, [%[key], #8]\n\t" + "ldr r7, [%[key], #12]\n\t" +#else + "ldrd r6, r7, [%[key], #8]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr %[key], [%[key], #16]\n\t" + "ldr %[len], [%[key], #20]\n\t" +#else + "ldrd %[key], %[len], [%[key], #16]\n\t" +#endif + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "rev %[key], %[key]\n\t" + "rev %[len], %[len]\n\t" + "stm %[ks], {r4, r5, r6, r7}\n\t" +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "str %[key], [%[ks], #16]\n\t" + "str %[len], [%[ks], #20]\n\t" +#else + "strd %[key], %[len], [%[ks], #16]\n\t" +#endif + "mov r7, %[len]\n\t" + "mov r12, #7\n\t" + "\n" + "L_AES_set_encrypt_key_loop_192_%=: \n\t" + "ubfx r0, r7, #0, #8\n\t" + "ubfx r1, r7, #8, #8\n\t" + "ubfx r4, r7, #16, #8\n\t" + "lsr r7, r7, #24\n\t" + "ldrb r0, [r8, r0, lsl #2]\n\t" + "ldrb r1, [r8, r1, lsl #2]\n\t" + "ldrb r4, [r8, r4, lsl #2]\n\t" + "ldrb r7, [r8, r7, lsl #2]\n\t" + "eor r3, r7, r0, lsl #8\n\t" + "eor r3, r3, r1, lsl #16\n\t" + "eor r3, r3, r4, lsl #24\n\t" + "ldm %[ks]!, {r0, r1, r4, r5, r6, r7}\n\t" + "eor r0, r0, r3\n\t" + "ldm lr!, {r3}\n\t" + "eor r0, r0, r3\n\t" + "eor r1, r1, r0\n\t" + "eor r4, r4, r1\n\t" + "eor r5, r5, r4\n\t" + "eor r6, r6, r5\n\t" + "eor r7, r7, r6\n\t" + "stm %[ks], {r0, r1, r4, r5, r6, r7}\n\t" + "subs r12, r12, #1\n\t" + "bne L_AES_set_encrypt_key_loop_192_%=\n\t" + "ubfx r0, r7, #0, #8\n\t" + "ubfx r1, r7, #8, #8\n\t" + "ubfx r4, r7, #16, #8\n\t" + "lsr r7, r7, #24\n\t" + "ldrb r0, [r8, r0, lsl #2]\n\t" + "ldrb r1, [r8, r1, lsl #2]\n\t" + "ldrb r4, [r8, r4, lsl #2]\n\t" + "ldrb r7, [r8, r7, lsl #2]\n\t" + "eor r3, r7, r0, lsl #8\n\t" + "eor r3, r3, r1, lsl #16\n\t" + "eor r3, r3, r4, lsl #24\n\t" + "ldm %[ks]!, {r0, r1, r4, r5, r6, r7}\n\t" + "eor r0, r0, r3\n\t" + "ldm lr!, {r3}\n\t" + "eor r0, r0, r3\n\t" + "eor r1, r1, r0\n\t" + "eor r4, r4, r1\n\t" + "eor r5, r5, r4\n\t" + "stm %[ks], {r0, r1, r4, r5}\n\t" + "b L_AES_set_encrypt_key_end_%=\n\t" + "\n" + "L_AES_set_encrypt_key_start_128_%=: \n\t" +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r4, [%[key]]\n\t" + "ldr r5, [%[key], #4]\n\t" +#else + "ldrd r4, r5, [%[key]]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r6, [%[key], #8]\n\t" + "ldr r7, [%[key], #12]\n\t" +#else + "ldrd r6, r7, [%[key], #8]\n\t" +#endif + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "stm %[ks], {r4, r5, r6, r7}\n\t" + "mov r12, #10\n\t" + "\n" + "L_AES_set_encrypt_key_loop_128_%=: \n\t" + "ubfx r4, r7, #0, #8\n\t" + "ubfx r5, r7, #8, #8\n\t" + "ubfx r6, r7, #16, #8\n\t" + "lsr r7, r7, #24\n\t" + "ldrb r4, [r8, r4, lsl #2]\n\t" + "ldrb r5, [r8, r5, lsl #2]\n\t" + "ldrb r6, [r8, r6, lsl #2]\n\t" + "ldrb r7, [r8, r7, lsl #2]\n\t" + "eor r3, r7, r4, lsl #8\n\t" + "eor r3, r3, r5, lsl #16\n\t" + "eor r3, r3, r6, lsl #24\n\t" + "ldm %[ks]!, {r4, r5, r6, r7}\n\t" + "eor r4, r4, r3\n\t" + "ldm lr!, {r3}\n\t" + "eor r4, r4, r3\n\t" + "eor r5, r5, r4\n\t" + "eor r6, r6, r5\n\t" + "eor r7, r7, r6\n\t" + "stm %[ks], {r4, r5, r6, r7}\n\t" + "subs r12, r12, #1\n\t" + "bne L_AES_set_encrypt_key_loop_128_%=\n\t" + "\n" + "L_AES_set_encrypt_key_end_%=: \n\t" + : [key] "+r" (key), [len] "+r" (len), [ks] "+r" (ks) + : 
[L_AES_ARM32_te] "g" (L_AES_ARM32_te), [L_AES_ARM32_rcon] "g" (L_AES_ARM32_rcon) + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8" + ); +} + +#if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) +void AES_encrypt_block(const uint32_t* te, int nr, int len, const uint32_t* ks); +void AES_encrypt_block(const uint32_t* te_p, int nr_p, int len_p, const uint32_t* ks_p) +{ + register const uint32_t* te asm ("r0") = te_p; + register int nr asm ("r1") = nr_p; + register int len asm ("r2") = len_p; + register const uint32_t* ks asm ("r3") = ks_p; + + __asm__ __volatile__ ( + "\n" + "L_AES_encrypt_block_nr_%=: \n\t" + "ubfx r8, r5, #16, #8\n\t" + "lsr r11, r4, #24\n\t" + "ubfx lr, r6, #8, #8\n\t" + "ubfx r2, r7, #0, #8\n\t" + "ldr r8, [%[te], r8, lsl #2]\n\t" + "ldr r11, [%[te], r11, lsl #2]\n\t" + "ldr lr, [%[te], lr, lsl #2]\n\t" + "ldr r2, [%[te], r2, lsl #2]\n\t" + "ubfx r9, r6, #16, #8\n\t" + "eor r8, r8, r11, ror #24\n\t" + "lsr r11, r5, #24\n\t" + "eor r8, r8, lr, ror #8\n\t" + "ubfx lr, r7, #8, #8\n\t" + "eor r8, r8, r2, ror #16\n\t" + "ubfx r2, r4, #0, #8\n\t" + "ldr r9, [%[te], r9, lsl #2]\n\t" + "ldr r11, [%[te], r11, lsl #2]\n\t" + "ldr lr, [%[te], lr, lsl #2]\n\t" + "ldr r2, [%[te], r2, lsl #2]\n\t" + "ubfx r10, r7, #16, #8\n\t" + "eor r9, r9, r11, ror #24\n\t" + "lsr r11, r6, #24\n\t" + "eor r9, r9, lr, ror #8\n\t" + "ubfx lr, r4, #8, #8\n\t" + "eor r9, r9, r2, ror #16\n\t" + "ubfx r2, r5, #0, #8\n\t" + "ldr r10, [%[te], r10, lsl #2]\n\t" + "ldr r11, [%[te], r11, lsl #2]\n\t" + "ldr lr, [%[te], lr, lsl #2]\n\t" + "ldr r2, [%[te], r2, lsl #2]\n\t" + "ubfx r6, r6, #0, #8\n\t" + "eor r10, r10, r11, ror #24\n\t" + "ubfx r11, r4, #16, #8\n\t" + "eor r10, r10, lr, ror #8\n\t" + "lsr lr, r7, #24\n\t" + "eor r10, r10, r2, ror #16\n\t" + "ubfx r2, r5, #8, #8\n\t" + "ldr r6, [%[te], r6, lsl #2]\n\t" + "ldr lr, [%[te], lr, lsl #2]\n\t" + "ldr r11, [%[te], r11, lsl #2]\n\t" + "ldr r2, [%[te], r2, lsl #2]\n\t" + "eor lr, lr, r6, ror #24\n\t" + "ldm %[ks]!, {r4, r5, r6, r7}\n\t" + "eor r11, r11, lr, ror #24\n\t" + "eor r11, r11, r2, ror #8\n\t" + /* XOR in Key Schedule */ + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "ubfx r4, r9, #16, #8\n\t" + "lsr r7, r8, #24\n\t" + "ubfx lr, r10, #8, #8\n\t" + "ubfx r2, r11, #0, #8\n\t" + "ldr r4, [%[te], r4, lsl #2]\n\t" + "ldr r7, [%[te], r7, lsl #2]\n\t" + "ldr lr, [%[te], lr, lsl #2]\n\t" + "ldr r2, [%[te], r2, lsl #2]\n\t" + "ubfx r5, r10, #16, #8\n\t" + "eor r4, r4, r7, ror #24\n\t" + "lsr r7, r9, #24\n\t" + "eor r4, r4, lr, ror #8\n\t" + "ubfx lr, r11, #8, #8\n\t" + "eor r4, r4, r2, ror #16\n\t" + "ubfx r2, r8, #0, #8\n\t" + "ldr r5, [%[te], r5, lsl #2]\n\t" + "ldr r7, [%[te], r7, lsl #2]\n\t" + "ldr lr, [%[te], lr, lsl #2]\n\t" + "ldr r2, [%[te], r2, lsl #2]\n\t" + "ubfx r6, r11, #16, #8\n\t" + "eor r5, r5, r7, ror #24\n\t" + "lsr r7, r10, #24\n\t" + "eor r5, r5, lr, ror #8\n\t" + "ubfx lr, r8, #8, #8\n\t" + "eor r5, r5, r2, ror #16\n\t" + "ubfx r2, r9, #0, #8\n\t" + "ldr r6, [%[te], r6, lsl #2]\n\t" + "ldr r7, [%[te], r7, lsl #2]\n\t" + "ldr lr, [%[te], lr, lsl #2]\n\t" + "ldr r2, [%[te], r2, lsl #2]\n\t" + "ubfx r10, r10, #0, #8\n\t" + "eor r6, r6, r7, ror #24\n\t" + "ubfx r7, r8, #16, #8\n\t" + "eor r6, r6, lr, ror #8\n\t" + "lsr lr, r11, #24\n\t" + "eor r6, r6, r2, ror #16\n\t" + "ubfx r2, r9, #8, #8\n\t" + "ldr r10, [%[te], r10, lsl #2]\n\t" + "ldr lr, [%[te], lr, lsl #2]\n\t" + "ldr r7, [%[te], r7, lsl #2]\n\t" + "ldr r2, 
[%[te], r2, lsl #2]\n\t" + "eor lr, lr, r10, ror #24\n\t" + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" + "eor r7, r7, lr, ror #24\n\t" + "eor r7, r7, r2, ror #8\n\t" + /* XOR in Key Schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "subs %[nr], %[nr], #1\n\t" + "bne L_AES_encrypt_block_nr_%=\n\t" + "ubfx r8, r5, #16, #8\n\t" + "lsr r11, r4, #24\n\t" + "ubfx lr, r6, #8, #8\n\t" + "ubfx r2, r7, #0, #8\n\t" + "ldr r8, [%[te], r8, lsl #2]\n\t" + "ldr r11, [%[te], r11, lsl #2]\n\t" + "ldr lr, [%[te], lr, lsl #2]\n\t" + "ldr r2, [%[te], r2, lsl #2]\n\t" + "ubfx r9, r6, #16, #8\n\t" + "eor r8, r8, r11, ror #24\n\t" + "lsr r11, r5, #24\n\t" + "eor r8, r8, lr, ror #8\n\t" + "ubfx lr, r7, #8, #8\n\t" + "eor r8, r8, r2, ror #16\n\t" + "ubfx r2, r4, #0, #8\n\t" + "ldr r9, [%[te], r9, lsl #2]\n\t" + "ldr r11, [%[te], r11, lsl #2]\n\t" + "ldr lr, [%[te], lr, lsl #2]\n\t" + "ldr r2, [%[te], r2, lsl #2]\n\t" + "ubfx r10, r7, #16, #8\n\t" + "eor r9, r9, r11, ror #24\n\t" + "lsr r11, r6, #24\n\t" + "eor r9, r9, lr, ror #8\n\t" + "ubfx lr, r4, #8, #8\n\t" + "eor r9, r9, r2, ror #16\n\t" + "ubfx r2, r5, #0, #8\n\t" + "ldr r10, [%[te], r10, lsl #2]\n\t" + "ldr r11, [%[te], r11, lsl #2]\n\t" + "ldr lr, [%[te], lr, lsl #2]\n\t" + "ldr r2, [%[te], r2, lsl #2]\n\t" + "ubfx r6, r6, #0, #8\n\t" + "eor r10, r10, r11, ror #24\n\t" + "ubfx r11, r4, #16, #8\n\t" + "eor r10, r10, lr, ror #8\n\t" + "lsr lr, r7, #24\n\t" + "eor r10, r10, r2, ror #16\n\t" + "ubfx r2, r5, #8, #8\n\t" + "ldr r6, [%[te], r6, lsl #2]\n\t" + "ldr lr, [%[te], lr, lsl #2]\n\t" + "ldr r11, [%[te], r11, lsl #2]\n\t" + "ldr r2, [%[te], r2, lsl #2]\n\t" + "eor lr, lr, r6, ror #24\n\t" + "ldm %[ks]!, {r4, r5, r6, r7}\n\t" + "eor r11, r11, lr, ror #24\n\t" + "eor r11, r11, r2, ror #8\n\t" + /* XOR in Key Schedule */ + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "ubfx r4, r11, #0, #8\n\t" + "ubfx r7, r10, #8, #8\n\t" + "ubfx lr, r9, #16, #8\n\t" + "lsr r2, r8, #24\n\t" + "ldrb r4, [%[te], r4, lsl #2]\n\t" + "ldrb r7, [%[te], r7, lsl #2]\n\t" + "ldrb lr, [%[te], lr, lsl #2]\n\t" + "ldrb r2, [%[te], r2, lsl #2]\n\t" + "ubfx r5, r8, #0, #8\n\t" + "eor r4, r4, r7, lsl #8\n\t" + "ubfx r7, r11, #8, #8\n\t" + "eor r4, r4, lr, lsl #16\n\t" + "ubfx lr, r10, #16, #8\n\t" + "eor r4, r4, r2, lsl #24\n\t" + "lsr r2, r9, #24\n\t" + "ldrb r5, [%[te], r5, lsl #2]\n\t" + "ldrb r7, [%[te], r7, lsl #2]\n\t" + "ldrb lr, [%[te], lr, lsl #2]\n\t" + "ldrb r2, [%[te], r2, lsl #2]\n\t" + "ubfx r6, r9, #0, #8\n\t" + "eor r5, r5, r7, lsl #8\n\t" + "ubfx r7, r8, #8, #8\n\t" + "eor r5, r5, lr, lsl #16\n\t" + "ubfx lr, r11, #16, #8\n\t" + "eor r5, r5, r2, lsl #24\n\t" + "lsr r2, r10, #24\n\t" + "ldrb r6, [%[te], r6, lsl #2]\n\t" + "ldrb r7, [%[te], r7, lsl #2]\n\t" + "ldrb lr, [%[te], lr, lsl #2]\n\t" + "ldrb r2, [%[te], r2, lsl #2]\n\t" + "lsr r11, r11, #24\n\t" + "eor r6, r6, r7, lsl #8\n\t" + "ubfx r7, r10, #0, #8\n\t" + "eor r6, r6, lr, lsl #16\n\t" + "ubfx lr, r9, #8, #8\n\t" + "eor r6, r6, r2, lsl #24\n\t" + "ubfx r2, r8, #16, #8\n\t" + "ldrb r11, [%[te], r11, lsl #2]\n\t" + "ldrb r7, [%[te], r7, lsl #2]\n\t" + "ldrb lr, [%[te], lr, lsl #2]\n\t" + "ldrb r2, [%[te], r2, lsl #2]\n\t" + "eor lr, lr, r11, lsl #16\n\t" + "ldm %[ks], {r8, r9, r10, r11}\n\t" + "eor r7, r7, lr, lsl #8\n\t" + "eor r7, r7, r2, lsl #16\n\t" + /* XOR in Key Schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + : [te] "+r" (te), [nr] "+r" (nr), [len] 
"+r" (len), [ks] "+r" (ks) + : + : "memory", "lr" + ); +} + +static const uint32_t* L_AES_ARM32_te_ecb = L_AES_ARM32_te_data; +#if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) +void AES_ECB_encrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr); +void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p) +{ + register const unsigned char* in asm ("r0") = in_p; + register unsigned char* out asm ("r1") = out_p; + register unsigned long len asm ("r2") = len_p; + register const unsigned char* ks asm ("r3") = ks_p; + register int nr asm ("r4") = nr_p; + + __asm__ __volatile__ ( + "mov lr, %[in]\n\t" + "ldr r0, %[L_AES_ARM32_te_ecb]\n\t" + "mov r12, r4\n\t" + "push {%[ks]}\n\t" + "cmp r12, #10\n\t" + "beq L_AES_ECB_encrypt_start_block_128_%=\n\t" + "cmp r12, #12\n\t" + "beq L_AES_ECB_encrypt_start_block_192_%=\n\t" + "\n" + "L_AES_ECB_encrypt_loop_block_256_%=: \n\t" + "ldr r4, [lr]\n\t" + "ldr r5, [lr, #4]\n\t" + "ldr r6, [lr, #8]\n\t" + "ldr r7, [lr, #12]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "push {r1, %[len], lr}\n\t" + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #6\n\t" + "bl AES_encrypt_block\n\t" + "pop {r1, %[len], lr}\n\t" + "ldr %[ks], [sp]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "subs %[len], %[len], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "bne L_AES_ECB_encrypt_loop_block_256_%=\n\t" + "b L_AES_ECB_encrypt_end_%=\n\t" + "\n" + "L_AES_ECB_encrypt_start_block_192_%=: \n\t" + "\n" + "L_AES_ECB_encrypt_loop_block_192_%=: \n\t" + "ldr r4, [lr]\n\t" + "ldr r5, [lr, #4]\n\t" + "ldr r6, [lr, #8]\n\t" + "ldr r7, [lr, #12]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "push {r1, %[len], lr}\n\t" + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #5\n\t" + "bl AES_encrypt_block\n\t" + "pop {r1, %[len], lr}\n\t" + "ldr %[ks], [sp]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "subs %[len], %[len], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "bne L_AES_ECB_encrypt_loop_block_192_%=\n\t" + "b L_AES_ECB_encrypt_end_%=\n\t" + "\n" + "L_AES_ECB_encrypt_start_block_128_%=: \n\t" + "\n" + "L_AES_ECB_encrypt_loop_block_128_%=: \n\t" + "ldr r4, [lr]\n\t" + "ldr r5, [lr, #4]\n\t" + "ldr r6, [lr, #8]\n\t" + "ldr r7, [lr, #12]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "push {r1, %[len], lr}\n\t" + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #4\n\t" + "bl AES_encrypt_block\n\t" + "pop {r1, %[len], lr}\n\t" + "ldr %[ks], [sp]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + 
"str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "subs %[len], %[len], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "bne L_AES_ECB_encrypt_loop_block_128_%=\n\t" + "\n" + "L_AES_ECB_encrypt_end_%=: \n\t" + "pop {%[ks]}\n\t" + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr) + : [L_AES_ARM32_te_ecb] "g" (L_AES_ARM32_te_ecb) + : "memory", "r12", "lr", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + (void)nr; +} + +#endif /* HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ +#ifdef HAVE_AES_CBC +void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr, unsigned char* iv); +void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p, unsigned char* iv_p) +{ + register const unsigned char* in asm ("r0") = in_p; + register unsigned char* out asm ("r1") = out_p; + register unsigned long len asm ("r2") = len_p; + register const unsigned char* ks asm ("r3") = ks_p; + register int nr asm ("r4") = nr_p; + register unsigned char* iv asm ("r5") = iv_p; + + __asm__ __volatile__ ( + "mov r8, r4\n\t" + "mov r9, r5\n\t" + "mov lr, %[in]\n\t" + "ldr r0, %[L_AES_ARM32_te_ecb]\n\t" + "ldm r9, {r4, r5, r6, r7}\n\t" + "push {%[ks], r9}\n\t" + "cmp r8, #10\n\t" + "beq L_AES_CBC_encrypt_start_block_128_%=\n\t" + "cmp r8, #12\n\t" + "beq L_AES_CBC_encrypt_start_block_192_%=\n\t" + "\n" + "L_AES_CBC_encrypt_loop_block_256_%=: \n\t" + "ldr r8, [lr]\n\t" + "ldr r9, [lr, #4]\n\t" + "ldr r10, [lr, #8]\n\t" + "ldr r11, [lr, #12]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "push {r1, %[len], lr}\n\t" + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #6\n\t" + "bl AES_encrypt_block\n\t" + "pop {r1, %[len], lr}\n\t" + "ldr %[ks], [sp]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "subs %[len], %[len], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "bne L_AES_CBC_encrypt_loop_block_256_%=\n\t" + "b L_AES_CBC_encrypt_end_%=\n\t" + "\n" + "L_AES_CBC_encrypt_start_block_192_%=: \n\t" + "\n" + "L_AES_CBC_encrypt_loop_block_192_%=: \n\t" + "ldr r8, [lr]\n\t" + "ldr r9, [lr, #4]\n\t" + "ldr r10, [lr, #8]\n\t" + "ldr r11, [lr, #12]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "push {r1, %[len], lr}\n\t" + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #5\n\t" + "bl AES_encrypt_block\n\t" + "pop {r1, %[len], lr}\n\t" + "ldr %[ks], [sp]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "subs %[len], %[len], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "bne L_AES_CBC_encrypt_loop_block_192_%=\n\t" + "b L_AES_CBC_encrypt_end_%=\n\t" + "\n" + 
"L_AES_CBC_encrypt_start_block_128_%=: \n\t" + "\n" + "L_AES_CBC_encrypt_loop_block_128_%=: \n\t" + "ldr r8, [lr]\n\t" + "ldr r9, [lr, #4]\n\t" + "ldr r10, [lr, #8]\n\t" + "ldr r11, [lr, #12]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "push {r1, %[len], lr}\n\t" + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #4\n\t" + "bl AES_encrypt_block\n\t" + "pop {r1, %[len], lr}\n\t" + "ldr %[ks], [sp]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "subs %[len], %[len], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "bne L_AES_CBC_encrypt_loop_block_128_%=\n\t" + "\n" + "L_AES_CBC_encrypt_end_%=: \n\t" + "pop {%[ks], r9}\n\t" + "stm r9, {r4, r5, r6, r7}\n\t" + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv) + : [L_AES_ARM32_te_ecb] "g" (L_AES_ARM32_te_ecb) + : "memory", "r12", "lr", "r6", "r7", "r8", "r9", "r10", "r11" + ); + (void)nr; + (void)iv; +} + +#endif /* HAVE_AES_CBC */ +#ifdef WOLFSSL_AES_COUNTER +void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr); +void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p, unsigned char* ctr_p) +{ + register const unsigned char* in asm ("r0") = in_p; + register unsigned char* out asm ("r1") = out_p; + register unsigned long len asm ("r2") = len_p; + register const unsigned char* ks asm ("r3") = ks_p; + register int nr asm ("r4") = nr_p; + register unsigned char* ctr asm ("r5") = ctr_p; + + __asm__ __volatile__ ( + "mov r12, r4\n\t" + "mov r8, r5\n\t" + "mov lr, %[in]\n\t" + "ldr r0, %[L_AES_ARM32_te_ecb]\n\t" + "ldm r8, {r4, r5, r6, r7}\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "stm r8, {r4, r5, r6, r7}\n\t" + "push {%[ks], r8}\n\t" + "cmp r12, #10\n\t" + "beq L_AES_CTR_encrypt_start_block_128_%=\n\t" + "cmp r12, #12\n\t" + "beq L_AES_CTR_encrypt_start_block_192_%=\n\t" + "\n" + "L_AES_CTR_encrypt_loop_block_256_%=: \n\t" + "push {r1, %[len], lr}\n\t" + "ldr lr, [sp, #16]\n\t" + "adds r11, r7, #1\n\t" + "adcs r10, r6, #0\n\t" + "adcs r9, r5, #0\n\t" + "adc r8, r4, #0\n\t" + "stm lr, {r8, r9, r10, r11}\n\t" + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #6\n\t" + "bl AES_encrypt_block\n\t" + "pop {r1, %[len], lr}\n\t" + "ldr %[ks], [sp]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "ldr r8, [lr]\n\t" + "ldr r9, [lr, #4]\n\t" + "ldr r10, [lr, #8]\n\t" + "ldr r11, [lr, #12]\n\t" + "eor r4, r8\n\t" + "eor r5, r9\n\t" + "eor r6, r10\n\t" + "eor r7, r11\n\t" + "ldr r8, [sp, #4]\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "ldm r8, {r4, r5, r6, r7}\n\t" + "subs %[len], %[len], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "bne L_AES_CTR_encrypt_loop_block_256_%=\n\t" + "b L_AES_CTR_encrypt_end_%=\n\t" + "\n" + 
"L_AES_CTR_encrypt_start_block_192_%=: \n\t" + "\n" + "L_AES_CTR_encrypt_loop_block_192_%=: \n\t" + "push {r1, %[len], lr}\n\t" + "ldr lr, [sp, #16]\n\t" + "adds r11, r7, #1\n\t" + "adcs r10, r6, #0\n\t" + "adcs r9, r5, #0\n\t" + "adc r8, r4, #0\n\t" + "stm lr, {r8, r9, r10, r11}\n\t" + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #5\n\t" + "bl AES_encrypt_block\n\t" + "pop {r1, %[len], lr}\n\t" + "ldr %[ks], [sp]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "ldr r8, [lr]\n\t" + "ldr r9, [lr, #4]\n\t" + "ldr r10, [lr, #8]\n\t" + "ldr r11, [lr, #12]\n\t" + "eor r4, r8\n\t" + "eor r5, r9\n\t" + "eor r6, r10\n\t" + "eor r7, r11\n\t" + "ldr r8, [sp, #4]\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "ldm r8, {r4, r5, r6, r7}\n\t" + "subs %[len], %[len], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "bne L_AES_CTR_encrypt_loop_block_192_%=\n\t" + "b L_AES_CTR_encrypt_end_%=\n\t" + "\n" + "L_AES_CTR_encrypt_start_block_128_%=: \n\t" + "\n" + "L_AES_CTR_encrypt_loop_block_128_%=: \n\t" + "push {r1, %[len], lr}\n\t" + "ldr lr, [sp, #16]\n\t" + "adds r11, r7, #1\n\t" + "adcs r10, r6, #0\n\t" + "adcs r9, r5, #0\n\t" + "adc r8, r4, #0\n\t" + "stm lr, {r8, r9, r10, r11}\n\t" + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #4\n\t" + "bl AES_encrypt_block\n\t" + "pop {r1, %[len], lr}\n\t" + "ldr %[ks], [sp]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "ldr r8, [lr]\n\t" + "ldr r9, [lr, #4]\n\t" + "ldr r10, [lr, #8]\n\t" + "ldr r11, [lr, #12]\n\t" + "eor r4, r8\n\t" + "eor r5, r9\n\t" + "eor r6, r10\n\t" + "eor r7, r11\n\t" + "ldr r8, [sp, #4]\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "ldm r8, {r4, r5, r6, r7}\n\t" + "subs %[len], %[len], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "bne L_AES_CTR_encrypt_loop_block_128_%=\n\t" + "\n" + "L_AES_CTR_encrypt_end_%=: \n\t" + "pop {%[ks], r8}\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "stm r8, {r4, r5, r6, r7}\n\t" + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr) + : [L_AES_ARM32_te_ecb] "g" (L_AES_ARM32_te_ecb) + : "memory", "r12", "lr", "r6", "r7", "r8", "r9", "r10", "r11" + ); + (void)nr; + (void)ctr; +} + +#endif /* WOLFSSL_AES_COUNTER */ +#endif /* HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ +#ifdef HAVE_AES_DECRYPT +#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || defined(HAVE_AES_CBC) +void AES_decrypt_block(const uint32_t* td, int nr); +void AES_decrypt_block(const uint32_t* td_p, int nr_p) +{ + register const uint32_t* td asm ("r0") = td_p; + register int nr asm ("r1") = nr_p; + + __asm__ __volatile__ ( + "\n" + "L_AES_decrypt_block_nr_%=: \n\t" + "ubfx r8, r7, #16, #8\n\t" + "lsr r11, r4, #24\n\t" + "ubfx lr, r6, #8, #8\n\t" + "ubfx r2, r5, #0, #8\n\t" + "ldr r8, [%[td], r8, lsl #2]\n\t" + "ldr r11, [%[td], r11, lsl #2]\n\t" + "ldr lr, [%[td], lr, lsl #2]\n\t" + "ldr r2, [%[td], r2, lsl #2]\n\t" + "ubfx r9, r4, #16, #8\n\t" + "eor r8, r8, r11, ror #24\n\t" + "lsr r11, r5, 
#24\n\t" + "eor r8, r8, lr, ror #8\n\t" + "ubfx lr, r7, #8, #8\n\t" + "eor r8, r8, r2, ror #16\n\t" + "ubfx r2, r6, #0, #8\n\t" + "ldr r9, [%[td], r9, lsl #2]\n\t" + "ldr r11, [%[td], r11, lsl #2]\n\t" + "ldr lr, [%[td], lr, lsl #2]\n\t" + "ldr r2, [%[td], r2, lsl #2]\n\t" + "ubfx r10, r5, #16, #8\n\t" + "eor r9, r9, r11, ror #24\n\t" + "lsr r11, r6, #24\n\t" + "eor r9, r9, lr, ror #8\n\t" + "ubfx lr, r4, #8, #8\n\t" + "eor r9, r9, r2, ror #16\n\t" + "ubfx r2, r7, #0, #8\n\t" + "ldr r10, [%[td], r10, lsl #2]\n\t" + "ldr r11, [%[td], r11, lsl #2]\n\t" + "ldr lr, [%[td], lr, lsl #2]\n\t" + "ldr r2, [%[td], r2, lsl #2]\n\t" + "ubfx r4, r4, #0, #8\n\t" + "eor r10, r10, r11, ror #24\n\t" + "ubfx r11, r6, #16, #8\n\t" + "eor r10, r10, lr, ror #8\n\t" + "lsr lr, r7, #24\n\t" + "eor r10, r10, r2, ror #16\n\t" + "ubfx r2, r5, #8, #8\n\t" + "ldr r4, [%[td], r4, lsl #2]\n\t" + "ldr lr, [%[td], lr, lsl #2]\n\t" + "ldr r11, [%[td], r11, lsl #2]\n\t" + "ldr r2, [%[td], r2, lsl #2]\n\t" + "eor lr, lr, r4, ror #24\n\t" + "ldm r3!, {r4, r5, r6, r7}\n\t" + "eor r11, r11, r2, ror #8\n\t" + "eor r11, r11, lr, ror #24\n\t" + /* XOR in Key Schedule */ + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "ubfx r4, r11, #16, #8\n\t" + "lsr r7, r8, #24\n\t" + "ubfx lr, r10, #8, #8\n\t" + "ubfx r2, r9, #0, #8\n\t" + "ldr r4, [%[td], r4, lsl #2]\n\t" + "ldr r7, [%[td], r7, lsl #2]\n\t" + "ldr lr, [%[td], lr, lsl #2]\n\t" + "ldr r2, [%[td], r2, lsl #2]\n\t" + "ubfx r5, r8, #16, #8\n\t" + "eor r4, r4, r7, ror #24\n\t" + "lsr r7, r9, #24\n\t" + "eor r4, r4, lr, ror #8\n\t" + "ubfx lr, r11, #8, #8\n\t" + "eor r4, r4, r2, ror #16\n\t" + "ubfx r2, r10, #0, #8\n\t" + "ldr r5, [%[td], r5, lsl #2]\n\t" + "ldr r7, [%[td], r7, lsl #2]\n\t" + "ldr lr, [%[td], lr, lsl #2]\n\t" + "ldr r2, [%[td], r2, lsl #2]\n\t" + "ubfx r6, r9, #16, #8\n\t" + "eor r5, r5, r7, ror #24\n\t" + "lsr r7, r10, #24\n\t" + "eor r5, r5, lr, ror #8\n\t" + "ubfx lr, r8, #8, #8\n\t" + "eor r5, r5, r2, ror #16\n\t" + "ubfx r2, r11, #0, #8\n\t" + "ldr r6, [%[td], r6, lsl #2]\n\t" + "ldr r7, [%[td], r7, lsl #2]\n\t" + "ldr lr, [%[td], lr, lsl #2]\n\t" + "ldr r2, [%[td], r2, lsl #2]\n\t" + "ubfx r8, r8, #0, #8\n\t" + "eor r6, r6, r7, ror #24\n\t" + "ubfx r7, r10, #16, #8\n\t" + "eor r6, r6, lr, ror #8\n\t" + "lsr lr, r11, #24\n\t" + "eor r6, r6, r2, ror #16\n\t" + "ubfx r2, r9, #8, #8\n\t" + "ldr r8, [%[td], r8, lsl #2]\n\t" + "ldr lr, [%[td], lr, lsl #2]\n\t" + "ldr r7, [%[td], r7, lsl #2]\n\t" + "ldr r2, [%[td], r2, lsl #2]\n\t" + "eor lr, lr, r8, ror #24\n\t" + "ldm r3!, {r8, r9, r10, r11}\n\t" + "eor r7, r7, r2, ror #8\n\t" + "eor r7, r7, lr, ror #24\n\t" + /* XOR in Key Schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "subs %[nr], %[nr], #1\n\t" + "bne L_AES_decrypt_block_nr_%=\n\t" + "ubfx r8, r7, #16, #8\n\t" + "lsr r11, r4, #24\n\t" + "ubfx lr, r6, #8, #8\n\t" + "ubfx r2, r5, #0, #8\n\t" + "ldr r8, [%[td], r8, lsl #2]\n\t" + "ldr r11, [%[td], r11, lsl #2]\n\t" + "ldr lr, [%[td], lr, lsl #2]\n\t" + "ldr r2, [%[td], r2, lsl #2]\n\t" + "ubfx r9, r4, #16, #8\n\t" + "eor r8, r8, r11, ror #24\n\t" + "lsr r11, r5, #24\n\t" + "eor r8, r8, lr, ror #8\n\t" + "ubfx lr, r7, #8, #8\n\t" + "eor r8, r8, r2, ror #16\n\t" + "ubfx r2, r6, #0, #8\n\t" + "ldr r9, [%[td], r9, lsl #2]\n\t" + "ldr r11, [%[td], r11, lsl #2]\n\t" + "ldr lr, [%[td], lr, lsl #2]\n\t" + "ldr r2, [%[td], r2, lsl #2]\n\t" + "ubfx r10, r5, #16, #8\n\t" + "eor r9, r9, r11, ror #24\n\t" + "lsr r11, 
r6, #24\n\t" + "eor r9, r9, lr, ror #8\n\t" + "ubfx lr, r4, #8, #8\n\t" + "eor r9, r9, r2, ror #16\n\t" + "ubfx r2, r7, #0, #8\n\t" + "ldr r10, [%[td], r10, lsl #2]\n\t" + "ldr r11, [%[td], r11, lsl #2]\n\t" + "ldr lr, [%[td], lr, lsl #2]\n\t" + "ldr r2, [%[td], r2, lsl #2]\n\t" + "ubfx r4, r4, #0, #8\n\t" + "eor r10, r10, r11, ror #24\n\t" + "ubfx r11, r6, #16, #8\n\t" + "eor r10, r10, lr, ror #8\n\t" + "lsr lr, r7, #24\n\t" + "eor r10, r10, r2, ror #16\n\t" + "ubfx r2, r5, #8, #8\n\t" + "ldr r4, [%[td], r4, lsl #2]\n\t" + "ldr lr, [%[td], lr, lsl #2]\n\t" + "ldr r11, [%[td], r11, lsl #2]\n\t" + "ldr r2, [%[td], r2, lsl #2]\n\t" + "eor lr, lr, r4, ror #24\n\t" + "ldm r3!, {r4, r5, r6, r7}\n\t" + "eor r11, r11, r2, ror #8\n\t" + "eor r11, r11, lr, ror #24\n\t" + /* XOR in Key Schedule */ + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "ubfx r4, r9, #0, #8\n\t" + "ubfx r7, r10, #8, #8\n\t" + "ubfx lr, r11, #16, #8\n\t" + "lsr r2, r8, #24\n\t" + "ldrb r4, [r12, r4]\n\t" + "ldrb r7, [r12, r7]\n\t" + "ldrb lr, [r12, lr]\n\t" + "ldrb r2, [r12, r2]\n\t" + "ubfx r5, r10, #0, #8\n\t" + "eor r4, r4, r7, lsl #8\n\t" + "ubfx r7, r11, #8, #8\n\t" + "eor r4, r4, lr, lsl #16\n\t" + "ubfx lr, r8, #16, #8\n\t" + "eor r4, r4, r2, lsl #24\n\t" + "lsr r2, r9, #24\n\t" + "ldrb r7, [r12, r7]\n\t" + "ldrb r2, [r12, r2]\n\t" + "ldrb r5, [r12, r5]\n\t" + "ldrb lr, [r12, lr]\n\t" + "ubfx r6, r11, #0, #8\n\t" + "eor r5, r5, r7, lsl #8\n\t" + "ubfx r7, r8, #8, #8\n\t" + "eor r5, r5, lr, lsl #16\n\t" + "ubfx lr, r9, #16, #8\n\t" + "eor r5, r5, r2, lsl #24\n\t" + "lsr r2, r10, #24\n\t" + "ldrb r7, [r12, r7]\n\t" + "ldrb r2, [r12, r2]\n\t" + "ldrb r6, [r12, r6]\n\t" + "ldrb lr, [r12, lr]\n\t" + "lsr r11, r11, #24\n\t" + "eor r6, r6, r7, lsl #8\n\t" + "ubfx r7, r8, #0, #8\n\t" + "eor r6, r6, lr, lsl #16\n\t" + "ubfx lr, r9, #8, #8\n\t" + "eor r6, r6, r2, lsl #24\n\t" + "ubfx r2, r10, #16, #8\n\t" + "ldrb r11, [r12, r11]\n\t" + "ldrb lr, [r12, lr]\n\t" + "ldrb r7, [r12, r7]\n\t" + "ldrb r2, [r12, r2]\n\t" + "eor lr, lr, r11, lsl #16\n\t" + "ldm r3, {r8, r9, r10, r11}\n\t" + "eor r7, r7, lr, lsl #8\n\t" + "eor r7, r7, r2, lsl #16\n\t" + /* XOR in Key Schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + : [td] "+r" (td), [nr] "+r" (nr) + : + : "memory", "lr" + ); +} + +static const uint32_t* L_AES_ARM32_td_ecb = L_AES_ARM32_td_data; +static const unsigned char L_AES_ARM32_td4[] = { + 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, + 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, + 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, + 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, + 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, + 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, + 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, + 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, + 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, + 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, + 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, + 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, + 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, + 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, + 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, + 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, + 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, + 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, + 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, + 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, + 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, + 0x6f, 
0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, + 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, + 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, + 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, + 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, + 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, + 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, + 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, + 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, + 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d, +}; + +#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) +void AES_ECB_decrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr); +void AES_ECB_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p) +{ + register const unsigned char* in asm ("r0") = in_p; + register unsigned char* out asm ("r1") = out_p; + register unsigned long len asm ("r2") = len_p; + register const unsigned char* ks asm ("r3") = ks_p; + register int nr asm ("r4") = nr_p; + + __asm__ __volatile__ ( + "mov r8, r4\n\t" + "mov lr, %[in]\n\t" + "ldr r0, %[L_AES_ARM32_td_ecb]\n\t" + "ldr r12, %[L_AES_ARM32_td4]\n\t" + "cmp r8, #10\n\t" + "beq L_AES_ECB_decrypt_start_block_128_%=\n\t" + "cmp r8, #12\n\t" + "beq L_AES_ECB_decrypt_start_block_192_%=\n\t" + "\n" + "L_AES_ECB_decrypt_loop_block_256_%=: \n\t" + "ldr r4, [lr]\n\t" + "ldr r5, [lr, #4]\n\t" + "ldr r6, [lr, #8]\n\t" + "ldr r7, [lr, #12]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "push {r1, r2, %[ks], lr}\n\t" + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #6\n\t" + "bl AES_decrypt_block\n\t" + "pop {r1, r2, %[ks], lr}\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "subs %[len], %[len], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "bne L_AES_ECB_decrypt_loop_block_256_%=\n\t" + "b L_AES_ECB_decrypt_end_%=\n\t" + "\n" + "L_AES_ECB_decrypt_start_block_192_%=: \n\t" + "\n" + "L_AES_ECB_decrypt_loop_block_192_%=: \n\t" + "ldr r4, [lr]\n\t" + "ldr r5, [lr, #4]\n\t" + "ldr r6, [lr, #8]\n\t" + "ldr r7, [lr, #12]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "push {r1, r2, %[ks], lr}\n\t" + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #5\n\t" + "bl AES_decrypt_block\n\t" + "pop {r1, r2, %[ks], lr}\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "subs %[len], %[len], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "bne L_AES_ECB_decrypt_loop_block_192_%=\n\t" + "b L_AES_ECB_decrypt_end_%=\n\t" + "\n" + "L_AES_ECB_decrypt_start_block_128_%=: \n\t" + "\n" + "L_AES_ECB_decrypt_loop_block_128_%=: \n\t" + "ldr r4, [lr]\n\t" + "ldr r5, [lr, #4]\n\t" + "ldr r6, [lr, #8]\n\t" + "ldr r7, [lr, #12]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "push {r1, r2, %[ks], lr}\n\t" + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" + /* Round: 0 - 
XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #4\n\t" + "bl AES_decrypt_block\n\t" + "pop {r1, r2, %[ks], lr}\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "subs %[len], %[len], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "bne L_AES_ECB_decrypt_loop_block_128_%=\n\t" + "\n" + "L_AES_ECB_decrypt_end_%=: \n\t" + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr) + : [L_AES_ARM32_td_ecb] "g" (L_AES_ARM32_td_ecb), [L_AES_ARM32_td4] "g" (L_AES_ARM32_td4) + : "memory", "r12", "lr", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + (void)nr; +} + +#endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ +#ifdef HAVE_AES_CBC +void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr, unsigned char* iv); +void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p, unsigned char* iv_p) +{ + register const unsigned char* in asm ("r0") = in_p; + register unsigned char* out asm ("r1") = out_p; + register unsigned long len asm ("r2") = len_p; + register const unsigned char* ks asm ("r3") = ks_p; + register int nr asm ("r4") = nr_p; + register unsigned char* iv asm ("r5") = iv_p; + + __asm__ __volatile__ ( + "mov r8, r4\n\t" + "mov r4, r5\n\t" + "mov lr, %[in]\n\t" + "ldr r0, %[L_AES_ARM32_td_ecb]\n\t" + "ldr r12, %[L_AES_ARM32_td4]\n\t" + "push {%[ks]-r4}\n\t" + "cmp r8, #10\n\t" + "beq L_AES_CBC_decrypt_loop_block_128_%=\n\t" + "cmp r8, #12\n\t" + "beq L_AES_CBC_decrypt_loop_block_192_%=\n\t" + "\n" + "L_AES_CBC_decrypt_loop_block_256_%=: \n\t" + "push {r1, r2, lr}\n\t" + "ldr r4, [lr]\n\t" + "ldr r5, [lr, #4]\n\t" + "ldr r6, [lr, #8]\n\t" + "ldr r7, [lr, #12]\n\t" + "ldr lr, [sp, #16]\n\t" +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "str r4, [lr, #16]\n\t" + "str r5, [lr, #20]\n\t" +#else + "strd r4, r5, [lr, #16]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "str r6, [lr, #24]\n\t" + "str r7, [lr, #28]\n\t" +#else + "strd r6, r7, [lr, #24]\n\t" +#endif + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #6\n\t" + "bl AES_decrypt_block\n\t" + "ldr lr, [sp, #16]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "ldm lr, {r8, r9, r10, r11}\n\t" + "pop {r1, r2, lr}\n\t" + "ldr %[ks], [sp]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "subs %[len], %[len], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "beq L_AES_CBC_decrypt_end_odd_%=\n\t" + "push {r1, r2, lr}\n\t" + "ldr r4, [lr]\n\t" + "ldr r5, [lr, #4]\n\t" + "ldr r6, [lr, #8]\n\t" + "ldr r7, [lr, #12]\n\t" + "ldr lr, [sp, #16]\n\t" +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "str r4, [lr]\n\t" + "str r5, [lr, #4]\n\t" +#else + "strd r4, r5, [lr]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "str r6, [lr, #8]\n\t" + "str 
r7, [lr, #12]\n\t" +#else + "strd r6, r7, [lr, #8]\n\t" +#endif + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #6\n\t" + "bl AES_decrypt_block\n\t" + "ldr lr, [sp, #16]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r8, [lr, #16]\n\t" + "ldr r9, [lr, #20]\n\t" +#else + "ldrd r8, r9, [lr, #16]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r10, [lr, #24]\n\t" + "ldr r11, [lr, #28]\n\t" +#else + "ldrd r10, r11, [lr, #24]\n\t" +#endif + "pop {r1, r2, lr}\n\t" + "ldr %[ks], [sp]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "subs %[len], %[len], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "bne L_AES_CBC_decrypt_loop_block_256_%=\n\t" + "b L_AES_CBC_decrypt_end_%=\n\t" + "\n" + "L_AES_CBC_decrypt_loop_block_192_%=: \n\t" + "push {r1, r2, lr}\n\t" + "ldr r4, [lr]\n\t" + "ldr r5, [lr, #4]\n\t" + "ldr r6, [lr, #8]\n\t" + "ldr r7, [lr, #12]\n\t" + "ldr lr, [sp, #16]\n\t" +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "str r4, [lr, #16]\n\t" + "str r5, [lr, #20]\n\t" +#else + "strd r4, r5, [lr, #16]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "str r6, [lr, #24]\n\t" + "str r7, [lr, #28]\n\t" +#else + "strd r6, r7, [lr, #24]\n\t" +#endif + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #5\n\t" + "bl AES_decrypt_block\n\t" + "ldr lr, [sp, #16]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "ldm lr, {r8, r9, r10, r11}\n\t" + "pop {r1, r2, lr}\n\t" + "ldr %[ks], [sp]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "subs %[len], %[len], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "beq L_AES_CBC_decrypt_end_odd_%=\n\t" + "push {r1, r2, lr}\n\t" + "ldr r4, [lr]\n\t" + "ldr r5, [lr, #4]\n\t" + "ldr r6, [lr, #8]\n\t" + "ldr r7, [lr, #12]\n\t" + "ldr lr, [sp, #16]\n\t" +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "str r4, [lr]\n\t" + "str r5, [lr, #4]\n\t" +#else + "strd r4, r5, [lr]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "str r6, [lr, #8]\n\t" + "str r7, [lr, #12]\n\t" +#else + "strd r6, r7, [lr, #8]\n\t" +#endif + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #5\n\t" + "bl AES_decrypt_block\n\t" + "ldr lr, [sp, #16]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r8, [lr, #16]\n\t" + "ldr r9, [lr, #20]\n\t" +#else + "ldrd r8, r9, 
[lr, #16]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r10, [lr, #24]\n\t" + "ldr r11, [lr, #28]\n\t" +#else + "ldrd r10, r11, [lr, #24]\n\t" +#endif + "pop {r1, r2, lr}\n\t" + "ldr %[ks], [sp]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "subs %[len], %[len], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "bne L_AES_CBC_decrypt_loop_block_192_%=\n\t" + "b L_AES_CBC_decrypt_end_%=\n\t" + "\n" + "L_AES_CBC_decrypt_loop_block_128_%=: \n\t" + "push {r1, r2, lr}\n\t" + "ldr r4, [lr]\n\t" + "ldr r5, [lr, #4]\n\t" + "ldr r6, [lr, #8]\n\t" + "ldr r7, [lr, #12]\n\t" + "ldr lr, [sp, #16]\n\t" +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "str r4, [lr, #16]\n\t" + "str r5, [lr, #20]\n\t" +#else + "strd r4, r5, [lr, #16]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "str r6, [lr, #24]\n\t" + "str r7, [lr, #28]\n\t" +#else + "strd r6, r7, [lr, #24]\n\t" +#endif + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #4\n\t" + "bl AES_decrypt_block\n\t" + "ldr lr, [sp, #16]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "ldm lr, {r8, r9, r10, r11}\n\t" + "pop {r1, r2, lr}\n\t" + "ldr %[ks], [sp]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "subs %[len], %[len], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "beq L_AES_CBC_decrypt_end_odd_%=\n\t" + "push {r1, r2, lr}\n\t" + "ldr r4, [lr]\n\t" + "ldr r5, [lr, #4]\n\t" + "ldr r6, [lr, #8]\n\t" + "ldr r7, [lr, #12]\n\t" + "ldr lr, [sp, #16]\n\t" +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "str r4, [lr]\n\t" + "str r5, [lr, #4]\n\t" +#else + "strd r4, r5, [lr]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "str r6, [lr, #8]\n\t" + "str r7, [lr, #12]\n\t" +#else + "strd r6, r7, [lr, #8]\n\t" +#endif + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #4\n\t" + "bl AES_decrypt_block\n\t" + "ldr lr, [sp, #16]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r8, [lr, #16]\n\t" + "ldr r9, [lr, #20]\n\t" +#else + "ldrd r8, r9, [lr, #16]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r10, [lr, #24]\n\t" + "ldr r11, [lr, #28]\n\t" +#else + "ldrd r10, r11, [lr, #24]\n\t" +#endif + "pop {r1, r2, lr}\n\t" + "ldr %[ks], [sp]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "subs %[len], %[len], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "bne L_AES_CBC_decrypt_loop_block_128_%=\n\t" + "b 
L_AES_CBC_decrypt_end_%=\n\t" + "\n" + "L_AES_CBC_decrypt_end_odd_%=: \n\t" + "ldr r4, [sp, #4]\n\t" +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r8, [r4, #16]\n\t" + "ldr r9, [r4, #20]\n\t" +#else + "ldrd r8, r9, [r4, #16]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r10, [r4, #24]\n\t" + "ldr r11, [r4, #28]\n\t" +#else + "ldrd r10, r11, [r4, #24]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "str r8, [r4]\n\t" + "str r9, [r4, #4]\n\t" +#else + "strd r8, r9, [r4]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "str r10, [r4, #8]\n\t" + "str r11, [r4, #12]\n\t" +#else + "strd r10, r11, [r4, #8]\n\t" +#endif + "\n" + "L_AES_CBC_decrypt_end_%=: \n\t" + "pop {%[ks]-r4}\n\t" + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv) + : [L_AES_ARM32_td_ecb] "g" (L_AES_ARM32_td_ecb), [L_AES_ARM32_td4] "g" (L_AES_ARM32_td4) + : "memory", "r12", "lr", "r6", "r7", "r8", "r9", "r10", "r11" + ); + (void)nr; + (void)iv; +} + +#endif /* HAVE_AES_CBC */ +#endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER || HAVE_AES_CBC */ +#endif /* HAVE_AES_DECRYPT */ +#ifdef HAVE_AESGCM +static const uint32_t L_GCM_gmult_len_r[] = { + 0x00000000, 0x1c200000, 0x38400000, 0x24600000, + 0x70800000, 0x6ca00000, 0x48c00000, 0x54e00000, + 0xe1000000, 0xfd200000, 0xd9400000, 0xc5600000, + 0x91800000, 0x8da00000, 0xa9c00000, 0xb5e00000, +}; + +void GCM_gmult_len(unsigned char* x, const unsigned char** m, const unsigned char* data, unsigned long len); +void GCM_gmult_len(unsigned char* x_p, const unsigned char** m_p, const unsigned char* data_p, unsigned long len_p) +{ + register unsigned char* x asm ("r0") = x_p; + register const unsigned char** m asm ("r1") = m_p; + register const unsigned char* data asm ("r2") = data_p; + register unsigned long len asm ("r3") = len_p; + + __asm__ __volatile__ ( + "ldr lr, %[L_GCM_gmult_len_r]\n\t" + "\n" + "L_GCM_gmult_len_start_block_%=: \n\t" + "push {r3}\n\t" + "ldr r12, [r0, #12]\n\t" + "ldr %[len], [r2, #12]\n\t" + "eor r12, r12, %[len]\n\t" + "lsr %[len], r12, #24\n\t" + "and %[len], %[len], #15\n\t" + "add %[len], %[m], %[len], lsl #4\n\t" + "ldm %[len], {r8, r9, r10, r11}\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #28\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #16\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "and r4, r4, #15\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #20\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "and r4, r4, #15\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, 
r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #8\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "and r4, r4, #15\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #12\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "and r4, r4, #15\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "and r4, r12, #15\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #4\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "and r4, r4, #15\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "ldr r12, [r0, #8]\n\t" + "ldr %[len], [r2, #8]\n\t" + "eor r12, r12, %[len]\n\t" + "lsr %[len], r12, #24\n\t" + "and %[len], %[len], #15\n\t" + "add %[len], %[m], %[len], lsl #4\n\t" + "ldm %[len], {r4, r5, r6, r7}\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #28\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #16\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "and r4, r4, #15\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor 
r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #20\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "and r4, r4, #15\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #8\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "and r4, r4, #15\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #12\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "and r4, r4, #15\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "and r4, r12, #15\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #4\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "and r4, r4, #15\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "ldr r12, [r0, #4]\n\t" + "ldr %[len], [r2, #4]\n\t" + "eor r12, r12, %[len]\n\t" + "lsr %[len], r12, #24\n\t" + "and %[len], %[len], #15\n\t" + "add %[len], %[m], %[len], lsl #4\n\t" + "ldm %[len], {r4, r5, r6, r7}\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #28\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], 
r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #16\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "and r4, r4, #15\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #20\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "and r4, r4, #15\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #8\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "and r4, r4, #15\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #12\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "and r4, r4, #15\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "and r4, r12, #15\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #4\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "and r4, r4, #15\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "ldr r12, [r0]\n\t" + "ldr %[len], [r2]\n\t" + "eor r12, r12, %[len]\n\t" + "lsr %[len], r12, #24\n\t" + "and %[len], %[len], #15\n\t" + "add %[len], %[m], %[len], lsl #4\n\t" + "ldm %[len], {r4, r5, r6, r7}\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #28\n\t" + "eor r11, r11, r10, lsl 
#28\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #16\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "and r4, r4, #15\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #20\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "and r4, r4, #15\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #8\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "and r4, r4, #15\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #12\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "and r4, r4, #15\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "and r4, r12, #15\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "lsr r6, r10, #4\n\t" + "and %[len], r11, #15\n\t" + "lsr r11, r11, #4\n\t" + "lsr r4, r12, #4\n\t" + "eor r11, r11, r10, lsl #28\n\t" + "and r4, r4, #15\n\t" + "ldr %[len], [lr, r3, lsl #2]\n\t" + "add r4, %[m], r4, lsl #4\n\t" + "eor r10, r6, r9, lsl #28\n\t" + "lsr r9, r9, #4\n\t" + "ldm r4, {r4, r5, r6, r7}\n\t" + "eor r9, r9, r8, lsl #28\n\t" + "eor r8, %[len], r8, lsr #4\n\t" + "eor r8, r8, r4\n\t" + "eor r9, r9, r5\n\t" + "eor r10, r10, r6\n\t" + "eor r11, r11, r7\n\t" + "rev r8, r8\n\t" + "rev r9, r9\n\t" + "rev r10, r10\n\t" + "rev r11, r11\n\t" + "stm %[x], {r8, r9, r10, r11}\n\t" + "pop {r3}\n\t" + "subs %[len], %[len], #16\n\t" + "add %[data], %[data], #16\n\t" + "bne L_GCM_gmult_len_start_block_%=\n\t" + : [x] "+r" (x), [m] "+r" (m), [data] "+r" (data), [len] "+r" (len) + : [L_AES_ARM32_td_ecb] "g" 
(L_AES_ARM32_td_ecb), [L_AES_ARM32_td4] "g" (L_AES_ARM32_td4), [L_GCM_gmult_len_r] "g" (L_GCM_gmult_len_r) + : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +static const uint32_t* L_AES_ARM32_te_gcm = L_AES_ARM32_te_data; +void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr); +void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p, unsigned char* ctr_p) +{ + register const unsigned char* in asm ("r0") = in_p; + register unsigned char* out asm ("r1") = out_p; + register unsigned long len asm ("r2") = len_p; + register const unsigned char* ks asm ("r3") = ks_p; + register int nr asm ("r4") = nr_p; + register unsigned char* ctr asm ("r5") = ctr_p; + + __asm__ __volatile__ ( + "mov r12, r4\n\t" + "mov r8, r5\n\t" + "mov lr, %[in]\n\t" + "ldr r0, %[L_AES_ARM32_te_gcm]\n\t" + "ldm r8, {r4, r5, r6, r7}\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "stm r8, {r4, r5, r6, r7}\n\t" + "push {%[ks], r8}\n\t" + "cmp r12, #10\n\t" + "beq L_AES_GCM_encrypt_start_block_128_%=\n\t" + "cmp r12, #12\n\t" + "beq L_AES_GCM_encrypt_start_block_192_%=\n\t" + "\n" + "L_AES_GCM_encrypt_loop_block_256_%=: \n\t" + "push {r1, %[len], lr}\n\t" + "ldr lr, [sp, #16]\n\t" + "add r7, r7, #1\n\t" + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" + "str r7, [lr, #12]\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #6\n\t" + "bl AES_encrypt_block\n\t" + "pop {r1, %[len], lr}\n\t" + "ldr %[ks], [sp]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "ldr r8, [lr]\n\t" + "ldr r9, [lr, #4]\n\t" + "ldr r10, [lr, #8]\n\t" + "ldr r11, [lr, #12]\n\t" + "eor r4, r8\n\t" + "eor r5, r9\n\t" + "eor r6, r10\n\t" + "eor r7, r11\n\t" + "ldr r8, [sp, #4]\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "ldm r8, {r4, r5, r6, r7}\n\t" + "subs %[len], %[len], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "bne L_AES_GCM_encrypt_loop_block_256_%=\n\t" + "b L_AES_GCM_encrypt_end_%=\n\t" + "\n" + "L_AES_GCM_encrypt_start_block_192_%=: \n\t" + "\n" + "L_AES_GCM_encrypt_loop_block_192_%=: \n\t" + "push {r1, %[len], lr}\n\t" + "ldr lr, [sp, #16]\n\t" + "add r7, r7, #1\n\t" + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" + "str r7, [lr, #12]\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #5\n\t" + "bl AES_encrypt_block\n\t" + "pop {r1, %[len], lr}\n\t" + "ldr %[ks], [sp]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "ldr r8, [lr]\n\t" + "ldr r9, [lr, #4]\n\t" + "ldr r10, [lr, #8]\n\t" + "ldr r11, [lr, #12]\n\t" + "eor r4, r8\n\t" + "eor r5, r9\n\t" + "eor r6, r10\n\t" + "eor r7, r11\n\t" + "ldr r8, [sp, #4]\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "ldm r8, {r4, r5, r6, r7}\n\t" + "subs %[len], %[len], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "bne L_AES_GCM_encrypt_loop_block_192_%=\n\t" + "b L_AES_GCM_encrypt_end_%=\n\t" + "\n" + "L_AES_GCM_encrypt_start_block_128_%=: \n\t" + "\n" + "L_AES_GCM_encrypt_loop_block_128_%=: \n\t" + "push {r1, %[len], lr}\n\t" + "ldr lr, [sp, #16]\n\t" + 
"add r7, r7, #1\n\t" + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" + "str r7, [lr, #12]\n\t" + /* Round: 0 - XOR in key schedule */ + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r10\n\t" + "eor r7, r7, r11\n\t" + "mov r1, #4\n\t" + "bl AES_encrypt_block\n\t" + "pop {r1, %[len], lr}\n\t" + "ldr %[ks], [sp]\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "ldr r8, [lr]\n\t" + "ldr r9, [lr, #4]\n\t" + "ldr r10, [lr, #8]\n\t" + "ldr r11, [lr, #12]\n\t" + "eor r4, r8\n\t" + "eor r5, r9\n\t" + "eor r6, r10\n\t" + "eor r7, r11\n\t" + "ldr r8, [sp, #4]\n\t" + "str r4, [%[out]]\n\t" + "str r5, [%[out], #4]\n\t" + "str r6, [%[out], #8]\n\t" + "str r7, [%[out], #12]\n\t" + "ldm r8, {r4, r5, r6, r7}\n\t" + "subs %[len], %[len], #16\n\t" + "add lr, lr, #16\n\t" + "add %[out], %[out], #16\n\t" + "bne L_AES_GCM_encrypt_loop_block_128_%=\n\t" + "\n" + "L_AES_GCM_encrypt_end_%=: \n\t" + "pop {%[ks], r8}\n\t" + "rev r4, r4\n\t" + "rev r5, r5\n\t" + "rev r6, r6\n\t" + "rev r7, r7\n\t" + "stm r8, {r4, r5, r6, r7}\n\t" + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr) + : [L_AES_ARM32_te_gcm] "g" (L_AES_ARM32_te_gcm) + : "memory", "r12", "lr", "r6", "r7", "r8", "r9", "r10", "r11" + ); + (void)nr; + (void)ctr; +} + +#endif /* HAVE_AESGCM */ +#endif /* !NO_AES */ +#endif /* !__aarch64__ && !__thumb__ */ +#endif /* WOLFSSL_ARMASM */ +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-32-curve25519.S b/wolfcrypt/src/port/arm/armv8-32-curve25519.S index 24072bd6dc..1f83fcc625 100644 --- a/wolfcrypt/src/port/arm/armv8-32-curve25519.S +++ b/wolfcrypt/src/port/arm/armv8-32-curve25519.S @@ -30,8 +30,14 @@ #include #ifdef WOLFSSL_ARMASM -#ifndef __aarch64__ -#ifdef HAVE_CURVE25519 +#if !defined(__aarch64__) && defined(__arm__) +#ifndef WOLFSSL_ARMASM_INLINE +/* Based on work by: Emil Lenngren + * https://github.com/pornin/X25519-Cortex-M4 + */ + +#if defined(HAVE_CURVE25519) || defined(HAVE_ED25519) +#if !defined(CURVE25519_SMALL) || !defined(ED25519_SMALL) .text .align 4 @@ -42,118 +48,338 @@ fe_init: .size fe_init,.-fe_init .text .align 4 - .globl fe_frombytes - .type fe_frombytes, %function -fe_frombytes: - push {r4, r5, r6, r7, r8, r9, lr} - ldr r2, [r1] - ldr r3, [r1, #4] - ldr r4, [r1, #8] - ldr r5, [r1, #12] - ldr r6, [r1, #16] - ldr r7, [r1, #20] - ldr r8, [r1, #24] - ldr r9, [r1, #28] - and r9, r9, #0x7fffffff + .globl fe_add_sub_op + .type fe_add_sub_op, %function +fe_add_sub_op: + push {lr} + # Add-Sub #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r2, [r0] - str r3, [r0, #4] + ldr r4, [r2] + ldr r5, [r2, #4] #else - strd r2, r3, [r0] + ldrd r4, r5, [r2] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #8] - str r5, [r0, #12] + ldr r6, [r3] + ldr r7, [r3, #4] #else - strd r4, r5, [r0, #8] + ldrd r6, r7, [r3] #endif + # Add + adds r8, r4, r6 + mov r12, #0 + adcs r9, r5, r7 + adc r12, r12, #0 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #16] - str r7, [r0, #20] + str r8, [r0] + str r9, [r0, #4] #else - strd r6, r7, [r0, #16] + strd r8, r9, [r0] #endif + # Sub + subs r10, r4, r6 + sbcs r11, r5, r7 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #24] - str r9, [r0, #28] + str r10, [r1] + str r11, [r1, #4] #else - strd r8, r9, [r0, #24] + strd r10, r11, [r1] #endif - pop {r4, r5, r6, r7, r8, r9, pc} - .size fe_frombytes,.-fe_frombytes - .text - .align 4 - .globl fe_tobytes - 
.type fe_tobytes, %function -fe_tobytes: - push {r4, r5, r6, r7, r8, r9, lr} #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r2, [r1] - ldr r3, [r1, #4] + ldr r4, [r2, #8] + ldr r5, [r2, #12] #else - ldrd r2, r3, [r1] + ldrd r4, r5, [r2, #8] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #8] - ldr r5, [r1, #12] + ldr r6, [r3, #8] + ldr r7, [r3, #12] #else - ldrd r4, r5, [r1, #8] + ldrd r6, r7, [r3, #8] +#endif + # Sub + sbcs r10, r4, r6 + mov lr, #0 + sbcs r11, r5, r7 + adc lr, lr, #0 +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + str r10, [r1, #8] + str r11, [r1, #12] +#else + strd r10, r11, [r1, #8] #endif + # Add + subs r12, r12, #1 + adcs r8, r4, r6 + adcs r9, r5, r7 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #16] - ldr r7, [r1, #20] + str r8, [r0, #8] + str r9, [r0, #12] #else - ldrd r6, r7, [r1, #16] + strd r8, r9, [r0, #8] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r1, #24] - ldr r9, [r1, #28] + ldr r4, [r2, #16] + ldr r5, [r2, #20] #else - ldrd r8, r9, [r1, #24] + ldrd r4, r5, [r2, #16] #endif - adds r12, r2, #19 - adcs r12, r3, #0 - adcs r12, r4, #0 - adcs r12, r5, #0 - adcs r12, r6, #0 - adcs r12, r7, #0 - adcs r12, r8, #0 - adc r12, r9, #0 - asr r12, r12, #31 - and r12, r12, #19 - adds r2, r2, r12 - adcs r3, r3, #0 - adcs r4, r4, #0 +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + ldr r6, [r3, #16] + ldr r7, [r3, #20] +#else + ldrd r6, r7, [r3, #16] +#endif + # Add + adcs r8, r4, r6 + mov r12, #0 + adcs r9, r5, r7 + adc r12, r12, #0 +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + str r8, [r0, #16] + str r9, [r0, #20] +#else + strd r8, r9, [r0, #16] +#endif + # Sub + subs lr, lr, #1 + sbcs r10, r4, r6 + sbcs r11, r5, r7 +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + str r10, [r1, #16] + str r11, [r1, #20] +#else + strd r10, r11, [r1, #16] +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + ldr r4, [r2, #24] + ldr r5, [r2, #28] +#else + ldrd r4, r5, [r2, #24] +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + ldr r6, [r3, #24] + ldr r7, [r3, #28] +#else + ldrd r6, r7, [r3, #24] +#endif + # Sub + sbcs r10, r4, r6 + sbcs r11, r5, r7 + sbc lr, lr, lr + # Add + subs r12, r12, #1 + adcs r8, r4, r6 + mov r12, #0 + adcs r9, r5, r7 + adc r12, r12, #0 + # Multiply -modulus by overflow + lsl r3, r12, #1 + mov r12, #19 + orr r3, r3, r9, lsr #31 + mul r12, r3, r12 + # Add -x*modulus (if overflow) +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + ldr r4, [r0] + ldr r5, [r0, #4] +#else + ldrd r4, r5, [r0] +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + ldr r6, [r0, #8] + ldr r7, [r0, #12] +#else + ldrd r6, r7, [r0, #8] +#endif + adds r4, r4, r12 adcs r5, r5, #0 adcs r6, r6, #0 adcs r7, r7, #0 - adcs r8, r8, #0 - adc r9, r9, #0 - and r9, r9, #0x7fffffff #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r2, [r0] - str r3, [r0, #4] + str r4, [r0] + str r5, [r0, #4] #else - strd r2, r3, [r0] + strd r4, r5, [r0] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #8] - str r5, [r0, #12] + str r6, [r0, #8] + str r7, [r0, #12] #else - strd r4, r5, [r0, #8] + strd r6, r7, [r0, #8] +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + ldr r4, [r0, #16] + ldr r5, [r0, #20] +#else + ldrd r4, r5, [r0, #16] #endif + adcs r4, r4, #0 + adcs r5, r5, #0 #if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #16] - str r7, [r0, #20] + str r4, [r0, #16] + str r5, [r0, #20] #else - strd r6, r7, [r0, #16] + strd r4, r5, [r0, #16] #endif + bfc r9, #31, #1 + adcs r8, r8, #0 + adc r9, r9, #0 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) str r8, [r0, #24] str r9, [r0, #28] #else strd r8, r9, [r0, #24] #endif + # Multiply -modulus by underflow + lsl r3, lr, #1 + mvn lr, #18 + orr r3, r3, r11, lsr #31 + mul lr, r3, lr + # Sub -x*modulus (if overflow) + ldm r1, {r4, r5, r6, r7, r8, r9} + subs r4, r4, lr + sbcs r5, r5, #0 + sbcs r6, r6, #0 + sbcs r7, r7, #0 + sbcs r8, r8, #0 + sbcs r9, r9, #0 + bfc r11, #31, #1 + sbcs r10, r10, #0 + sbc r11, r11, #0 + stm r1, {r4, r5, r6, r7, r8, r9, r10, r11} + # Done Add-Sub + pop {pc} + .size fe_add_sub_op,.-fe_add_sub_op + .text + .align 4 + .globl fe_sub_op + .type fe_sub_op, %function +fe_sub_op: + push {lr} + # Sub + ldm r2!, {r6, r7, r8, r9, r10, r11, r12, lr} + ldm r1!, {r2, r3, r4, r5} + subs r6, r2, r6 + sbcs r7, r3, r7 + sbcs r8, r4, r8 + sbcs r9, r5, r9 + ldm r1!, {r2, r3, r4, r5} + sbcs r10, r2, r10 + sbcs r11, r3, r11 + sbcs r12, r4, r12 + sbcs lr, r5, lr + sbc r3, r3, r3 + mvn r2, #18 + lsl r3, r3, #1 + orr r3, r3, lr, lsr #31 + mul r2, r3, r2 + subs r6, r6, r2 + sbcs r7, r7, #0 + sbcs r8, r8, #0 + sbcs r9, r9, #0 + sbcs r10, r10, #0 + sbcs r11, r11, #0 + bfc lr, #31, #1 + sbcs r12, r12, #0 + sbc lr, lr, #0 + stm r0, {r6, r7, r8, r9, r10, r11, r12, lr} + # Done Sub + pop {pc} + .size fe_sub_op,.-fe_sub_op + .text + .align 4 + .globl fe_sub + .type fe_sub, %function +fe_sub: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + bl fe_sub_op + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size fe_sub,.-fe_sub + .text + .align 4 + .globl fe_add_op + .type fe_add_op, %function +fe_add_op: + push {lr} + # Add + ldm r2!, {r6, r7, r8, r9, r10, r11, r12, lr} + ldm r1!, {r2, r3, r4, r5} + adds r6, r2, r6 + adcs r7, r3, r7 + adcs r8, r4, r8 + adcs r9, r5, r9 + ldm r1!, {r2, r3, r4, r5} + adcs r10, r2, r10 + adcs r11, r3, r11 + adcs r12, r4, r12 + mov r3, #0 + adcs lr, r5, lr + adc r3, r3, #0 + mov r2, #19 + lsl r3, r3, #1 + orr r3, r3, lr, lsr #31 + mul r2, r3, r2 + adds r6, r6, r2 + adcs r7, r7, #0 + adcs r8, r8, #0 + adcs r9, r9, #0 + adcs r10, r10, #0 + adcs r11, r11, #0 + bfc lr, #31, #1 + adcs r12, r12, #0 + adc lr, lr, #0 + stm r0, {r6, r7, r8, r9, r10, r11, r12, lr} + # Done Add + pop {pc} + .size fe_add_op,.-fe_add_op + .text + .align 4 + .globl fe_add + .type fe_add, %function +fe_add: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + bl fe_add_op + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size fe_add,.-fe_add +#ifdef HAVE_ED25519 + .text + .align 4 + .globl fe_frombytes + .type fe_frombytes, %function +fe_frombytes: + push {r4, r5, r6, r7, r8, r9, lr} + ldm r1, {r2, r3, r4, r5, r6, r7, r8, r9} + bfc r9, #31, #1 + stm r0, {r2, r3, r4, r5, r6, r7, r8, r9} + pop {r4, r5, r6, r7, r8, r9, pc} + .size fe_frombytes,.-fe_frombytes + .text + .align 4 + .globl fe_tobytes + .type fe_tobytes, %function +fe_tobytes: + push {r4, r5, r6, r7, r8, r9, lr} + ldm r1, {r2, r3, r4, r5, r6, r7, r8, r9} + adds r12, r2, #19 + adcs r12, r3, #0 + adcs r12, r4, #0 + adcs r12, r5, #0 + adcs r12, r6, #0 + adcs r12, r7, #0 + adcs r12, r8, #0 + adc r12, r9, #0 + asr r12, r12, #31 + and r12, r12, #19 + adds r2, r2, r12 + adcs r3, r3, #0 + adcs r4, r4, #0 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adc r9, r9, #0 + bfc r9, #31, #1 + stm r0, {r2, r3, r4, r5, r6, r7, r8, r9} 
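The new fe_add_op/fe_sub_op primitives above keep field elements as eight 32-bit limbs in a partially reduced form: after the limbwise add or subtract, everything at or above bit 255 is folded back in using 2^255 = 19 (mod p). A rough C model of the addition path follows (illustrative only, not the register scheduling used by the assembly).

#include <stdint.h>

/* Sketch of the fold in fe_add_op: o gathers the carry out of bit 255 plus
 * bit 255 itself, and o*19 is added back because 2^255 == 19 (mod 2^255-19).
 * The result stays partially reduced, as in the assembly. */
static void fe_add_model(uint32_t r[8], const uint32_t a[8], const uint32_t b[8])
{
    uint64_t t = 0;
    uint32_t o;
    int i;

    for (i = 0; i < 8; i++) {            /* limbwise add with carry */
        t += (uint64_t)a[i] + b[i];
        r[i] = (uint32_t)t;
        t >>= 32;
    }
    o = ((uint32_t)t << 1) | (r[7] >> 31);
    r[7] &= 0x7fffffff;                  /* clear bit 255 */
    t = (uint64_t)o * 19;                /* fold the overflow back in */
    for (i = 0; i < 8; i++) {
        t += r[i];
        r[i] = (uint32_t)t;
        t >>= 32;
    }
}

fe_sub_op mirrors this with a borrow instead of a carry, folding by the 0xffffffed (-19) constant that the assembly builds with mvn #18.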
pop {r4, r5, r6, r7, r8, r9, pc} .size fe_tobytes,.-fe_tobytes .text @@ -284,329 +510,25 @@ fe_copy: .size fe_copy,.-fe_copy .text .align 4 - .globl fe_sub - .type fe_sub, %function -fe_sub: - push {r4, r5, r6, r7, r8, r9, r10, r11, lr} - # Sub -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #8] - ldr r7, [r1, #12] -#else - ldrd r6, r7, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2] - ldr r9, [r2, #4] -#else - ldrd r8, r9, [r2] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #8] - ldr r11, [r2, #12] -#else - ldrd r10, r11, [r2, #8] -#endif - subs r8, r4, r8 - sbcs r9, r5, r9 - sbcs r10, r6, r10 - sbcs r11, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] -#else - strd r8, r9, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #8] - str r11, [r0, #12] -#else - strd r10, r11, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] -#else - ldrd r4, r5, [r1, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #24] - ldr r7, [r1, #28] -#else - ldrd r6, r7, [r1, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2, #16] - ldr r9, [r2, #20] -#else - ldrd r8, r9, [r2, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #24] - ldr r11, [r2, #28] -#else - ldrd r10, r11, [r2, #24] -#endif - sbcs r8, r4, r8 - sbcs r9, r5, r9 - sbcs r10, r6, r10 - sbc r11, r7, r11 - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Add modulus (if underflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #8] - ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif - adds r4, r4, r12 - adcs r5, r5, r3 - adcs r6, r6, r3 - adcs r7, r7, r3 - adcs r8, r8, r3 - adcs r9, r9, r3 - adcs r10, r10, r3 - adc r11, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] -#else - strd r6, r7, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #24] - str r11, [r0, #28] -#else - strd r10, r11, [r0, #24] -#endif - pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} - .size fe_sub,.-fe_sub - .text - .align 4 - .globl fe_add - .type fe_add, %function -fe_add: - push {r4, r5, r6, r7, r8, r9, r10, r11, lr} - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #8] - ldr r7, [r1, #12] -#else - ldrd r6, r7, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2] - ldr r9, [r2, #4] -#else - ldrd r8, r9, [r2] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && 
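The rewritten fe_tobytes just above does its canonical reduction with a single conditional add: adding 19 carries into bit 255 exactly when the value is at least p = 2^255 - 19, and that bit decides whether 19 is really added before bit 255 is dropped. A small C model, assuming the input is already partially reduced (below 2^255 + 38):

#include <stdint.h>

/* Sketch of the final reduction in fe_tobytes (fe_isnonzero uses the same
 * trick): if a >= p then a + 19 carries into bit 255, so add 19 and clear
 * bit 255 to land in [0, p). */
static void fe_canonical_model(uint32_t r[8], const uint32_t a[8])
{
    uint64_t c = 19;
    uint32_t add19;
    int i;

    for (i = 0; i < 7; i++)              /* only the carry of a + 19 matters */
        c = (c + a[i]) >> 32;
    c = (c + a[7]) >> 31;                /* bit 255 of a + 19 */
    add19 = ((uint32_t)c & 1) * 19;

    c = add19;                           /* conditionally add 19 */
    for (i = 0; i < 8; i++) {
        c += a[i];
        r[i] = (uint32_t)c;
        c >>= 32;
    }
    r[7] &= 0x7fffffff;                  /* drop bit 255 */
}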
(WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #8] - ldr r11, [r2, #12] -#else - ldrd r10, r11, [r2, #8] -#endif - adds r8, r4, r8 - adcs r9, r5, r9 - adcs r10, r6, r10 - adcs r11, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] -#else - strd r8, r9, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #8] - str r11, [r0, #12] -#else - strd r10, r11, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] -#else - ldrd r4, r5, [r1, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #24] - ldr r7, [r1, #28] -#else - ldrd r6, r7, [r1, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2, #16] - ldr r9, [r2, #20] -#else - ldrd r8, r9, [r2, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #24] - ldr r11, [r2, #28] -#else - ldrd r10, r11, [r2, #24] -#endif - adcs r8, r4, r8 - adcs r9, r5, r9 - adcs r10, r6, r10 - adc r11, r7, r11 - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Sub modulus (if overflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #8] - ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif - subs r4, r4, r12 - sbcs r5, r5, r3 - sbcs r6, r6, r3 - sbcs r7, r7, r3 - sbcs r8, r8, r3 - sbcs r9, r9, r3 - sbcs r10, r10, r3 - sbc r11, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] -#else - strd r6, r7, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #24] - str r11, [r0, #28] -#else - strd r10, r11, [r0, #24] -#endif - pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} - .size fe_add,.-fe_add - .text - .align 4 .globl fe_neg .type fe_neg, %function fe_neg: push {r4, r5, lr} - mov lr, #-1 - mov r12, #-19 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r2, [r1] - ldr r3, [r1, #4] -#else - ldrd r2, r3, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #8] - ldr r5, [r1, #12] -#else - ldrd r4, r5, [r1, #8] -#endif + mvn lr, #0 + mvn r12, #18 + ldm r1!, {r2, r3, r4, r5} subs r2, r12, r2 sbcs r3, lr, r3 sbcs r4, lr, r4 sbcs r5, lr, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r2, [r0] - str r3, [r0, #4] -#else - strd r2, r3, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #8] - str r5, [r0, #12] -#else - strd r4, r5, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - mov r12, #0x7fffff - lsl r12, r12, #8 - add r12, r12, #0xff -#else - mov r12, #0x7fffffff -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r2, [r1, #16] - ldr r3, [r1, #20] -#else - ldrd r2, r3, [r1, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #24] - ldr r5, [r1, #28] -#else - ldrd r4, r5, [r1, #24] -#endif + stm r0!, {r2, r3, r4, r5} + 
mvn r12, #0x80000000 + ldm r1!, {r2, r3, r4, r5} sbcs r2, lr, r2 sbcs r3, lr, r3 sbcs r4, lr, r4 sbc r5, r12, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r2, [r0, #16] - str r3, [r0, #20] -#else - strd r2, r3, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #24] - str r5, [r0, #28] -#else - strd r4, r5, [r0, #24] -#endif + stm r0!, {r2, r3, r4, r5} pop {r4, r5, pc} .size fe_neg,.-fe_neg .text @@ -615,30 +537,7 @@ fe_neg: .type fe_isnonzero, %function fe_isnonzero: push {r4, r5, r6, r7, r8, r9, lr} -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r2, [r0] - ldr r3, [r0, #4] -#else - ldrd r2, r3, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #8] - ldr r5, [r0, #12] -#else - ldrd r4, r5, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #16] - ldr r7, [r0, #20] -#else - ldrd r6, r7, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r0, #24] - ldr r9, [r0, #28] -#else - ldrd r8, r9, [r0, #24] -#endif + ldm r0, {r2, r3, r4, r5, r6, r7, r8, r9} adds r1, r2, #19 adcs r1, r3, #0 adcs r1, r4, #0 @@ -657,7 +556,7 @@ fe_isnonzero: adcs r7, r7, #0 adcs r8, r8, #0 adc r9, r9, #0 - and r9, r9, #0x7fffffff + bfc r9, #31, #1 orr r2, r2, r3 orr r4, r4, r5 orr r6, r6, r7 @@ -673,44 +572,23 @@ fe_isnonzero: .type fe_isnegative, %function fe_isnegative: push {r4, r5, lr} -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r2, [r0] - ldr r3, [r0, #4] -#else - ldrd r2, r3, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #8] - ldr r5, [r0, #12] -#else - ldrd r4, r5, [r0, #8] -#endif + ldm r0!, {r2, r3, r4, r5} adds r1, r2, #19 adcs r1, r3, #0 adcs r1, r4, #0 adcs r1, r5, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r2, [r0, #16] - ldr r3, [r0, #20] -#else - ldrd r2, r3, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #24] - ldr r5, [r0, #28] -#else - ldrd r4, r5, [r0, #24] -#endif + ldm r0, {r2, r3, r4, r5} adcs r1, r2, #0 adcs r1, r3, #0 adcs r1, r4, #0 - ldr r2, [r0] + ldr r2, [r0, #-16] adc r1, r5, #0 and r0, r2, #1 lsr r1, r1, #31 eor r0, r0, r1 pop {r4, r5, pc} .size fe_isnegative,.-fe_isnegative +#ifndef WC_NO_CACHE_RESISTANT .text .align 4 .globl fe_cmov_table @@ -1103,8 +981,8 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 sub r1, r1, #0x2a0 - mov r10, #-19 - mov r11, #-1 + mvn r10, #18 + mvn r11, #0 subs r10, r10, r8 sbcs r11, r11, r9 sbc lr, lr, lr @@ -1526,8 +1404,8 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 sub r1, r1, #0x2a0 - mov r10, #-1 - mov r11, #-1 + mvn r10, #0 + mvn r11, #0 rsbs lr, lr, #0 sbcs r10, r10, r8 sbcs r11, r11, r9 @@ -1950,8 +1828,8 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 sub r1, r1, #0x2a0 - mov r10, #-1 - mov r11, #-1 + mvn r10, #0 + mvn r11, #0 rsbs lr, lr, #0 sbcs r10, r10, r8 sbcs r11, r11, r9 @@ -2374,14 +2252,8 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 sub r1, r1, #0x2a0 - mov r10, #-1 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - mov r11, #0x7fffff - lsl r11, r11, #8 - add r11, r11, #0xff -#else - mov r11, #0x7fffffff -#endif + mvn r10, #0 + mvn r11, #0x80000000 rsbs lr, lr, #0 sbcs r10, r10, r8 sbc r11, r11, r9 @@ -2420,565 +2292,460 @@ fe_cmov_table: #endif pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size fe_cmov_table,.-fe_cmov_table +#else .text .align 4 - .globl 
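The new fe_neg above is a straight limbwise subtraction from the prime, with mvn building the 0xffffffed / 0xffffffff / 0x7fffffff limbs of p instead of the old per-limb constant loads. Roughly, in C (a sketch only; inputs are assumed partially reduced so the final borrow is benign):

#include <stdint.h>

/* Sketch of fe_neg: r = p - a, computed limb by limb with a running borrow,
 * where p = 2^255 - 19. */
static void fe_neg_model(uint32_t r[8], const uint32_t a[8])
{
    static const uint32_t p[8] = {
        0xffffffed, 0xffffffff, 0xffffffff, 0xffffffff,
        0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff
    };
    uint64_t borrow = 0;
    int i;

    for (i = 0; i < 8; i++) {
        uint64_t d = (uint64_t)p[i] - a[i] - borrow;
        r[i] = (uint32_t)d;
        borrow = d >> 63;                /* 1 when the subtraction wrapped */
    }
}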
fe_mul - .type fe_mul, %function -fe_mul: + .globl fe_cmov_table + .type fe_cmov_table, %function +fe_cmov_table: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} - sub sp, sp, #0x40 - # Multiply - ldr r7, [r1] - ldr r8, [r1, #4] - ldr r9, [r2] - ldr lr, [r2, #4] - # A[0] * B[0] = 0 - umull r4, r5, r7, r9 - str r4, [sp] - # A[0] * B[1] = 1 - umull r3, r6, r7, lr - adds r5, r5, r3 - adc r6, r6, #0 - # A[1] * B[0] = 1 - umull r3, r12, r8, r9 - adds r5, r5, r3 - mov r4, #0 - adcs r6, r6, r12 - adc r4, r4, #0 - str r5, [sp, #4] - # A[2] * B[0] = 2 - ldr r10, [r1, #8] - umull r3, r12, r10, r9 - adds r6, r6, r3 - adc r4, r4, r12 - # A[1] * B[1] = 2 - umull r3, r12, r8, lr - adds r6, r6, r3 - mov r5, #0 - adcs r4, r4, r12 - adc r5, r5, #0 - # A[0] * B[2] = 2 - ldr r11, [r2, #8] - umull r3, r12, r7, r11 - adds r6, r6, r3 - adcs r4, r4, r12 - adc r5, r5, #0 - str r6, [sp, #8] - # A[0] * B[3] = 3 - ldr r11, [r2, #12] - umull r3, r12, r7, r11 - adds r4, r4, r3 - mov r6, #0 - adcs r5, r5, r12 - adc r6, r6, #0 - # A[1] * B[2] = 3 - ldr r11, [r2, #8] - umull r3, r12, r8, r11 - adds r4, r4, r3 - adcs r5, r5, r12 - adc r6, r6, #0 - # A[2] * B[1] = 3 - umull r3, r12, r10, lr - adds r4, r4, r3 - adcs r5, r5, r12 - adc r6, r6, #0 - # A[3] * B[0] = 3 - ldr r10, [r1, #12] - umull r3, r12, r10, r9 - adds r4, r4, r3 - adcs r5, r5, r12 - adc r6, r6, #0 - str r4, [sp, #12] - # A[4] * B[0] = 4 - ldr r10, [r1, #16] - umull r3, r12, r10, r9 - adds r5, r5, r3 + sxtb r2, r2 + sbfx r3, r2, #7, #1 + eor r2, r2, r3 + sub r2, r2, r3 + clz lr, r2 + lsl lr, lr, #26 + asr lr, lr, #31 + mvn lr, lr + add r2, r2, lr + mov r12, #0x60 + mul r2, r2, r12 + add r1, r1, r2 + ldm r1!, {r4, r5, r6, r7, r8, r9, r10, r11} + and r4, r4, lr + and r5, r5, lr + and r6, r6, lr + and r7, r7, lr + and r8, r8, lr + and r9, r9, lr + and r10, r10, lr + and r11, r11, lr + mvn r12, lr + sub r4, r4, r12 + mov r12, #32 + and r12, r12, r3 + add r0, r0, r12 + stm r0, {r4, r5, r6, r7, r8, r9, r10, r11} + sub r0, r0, r12 + ldm r1!, {r4, r5, r6, r7, r8, r9, r10, r11} + and r4, r4, lr + and r5, r5, lr + and r6, r6, lr + and r7, r7, lr + and r8, r8, lr + and r9, r9, lr + and r10, r10, lr + and r11, r11, lr + mvn r12, lr + sub r4, r4, r12 + mov r12, #32 + bic r12, r12, r3 + add r0, r0, r12 + stm r0, {r4, r5, r6, r7, r8, r9, r10, r11} + sub r0, r0, r12 + add r0, r0, #0x40 + ldm r1!, {r4, r5, r6, r7} + mvn r12, #18 + subs r8, r12, r4 + sbcs r9, r3, r5 + sbcs r10, r3, r6 + sbcs r11, r3, r7 + bic r4, r4, r3 + bic r5, r5, r3 + bic r6, r6, r3 + bic r7, r7, r3 + and r8, r8, r3 + and r9, r9, r3 + and r10, r10, r3 + and r11, r11, r3 + orr r4, r4, r8 + orr r5, r5, r9 + orr r6, r6, r10 + orr r7, r7, r11 + and r4, r4, lr + and r5, r5, lr + and r6, r6, lr + and r7, r7, lr + stm r0!, {r4, r5, r6, r7} + ldm r1!, {r4, r5, r6, r7} + mvn r12, #0x80000000 + sbcs r8, r3, r4 + sbcs r9, r3, r5 + sbcs r10, r3, r6 + sbc r11, r12, r7 + bic r4, r4, r3 + bic r5, r5, r3 + bic r6, r6, r3 + bic r7, r7, r3 + and r8, r8, r3 + and r9, r9, r3 + and r10, r10, r3 + and r11, r11, r3 + orr r4, r4, r8 + orr r5, r5, r9 + orr r6, r6, r10 + orr r7, r7, r11 + and r4, r4, lr + and r5, r5, lr + and r6, r6, lr + and r7, r7, lr + stm r0!, {r4, r5, r6, r7} + sub r1, r1, r2 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size fe_cmov_table,.-fe_cmov_table +#endif /* WC_NO_CACHE_RESISTANT */ +#endif /* HAVE_ED25519 */ + .text + .align 4 + .globl fe_mul_op + .type fe_mul_op, %function +fe_mul_op: + push {lr} + sub sp, sp, #44 +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + str r0, [sp, #36] + str 
r1, [sp, #40] +#else + strd r0, r1, [sp, #36] +#endif + mov lr, r2 + ldm r1, {r0, r1, r2, r3} + ldm lr!, {r4, r5, r6} + umull r10, r11, r0, r4 + umull r12, r7, r1, r4 + umaal r11, r12, r0, r5 + umull r8, r9, r2, r4 + umaal r12, r8, r1, r5 + umaal r12, r7, r0, r6 + umaal r8, r9, r3, r4 + stm sp, {r10, r11, r12} + umaal r7, r8, r2, r5 + ldm lr!, {r4} + umull r10, r11, r1, r6 + umaal r8, r9, r2, r6 + umaal r7, r10, r0, r4 + umaal r8, r11, r3, r5 + str r7, [sp, #12] + umaal r8, r10, r1, r4 + umaal r9, r11, r3, r6 + umaal r9, r10, r2, r4 + umaal r10, r11, r3, r4 + ldm lr, {r4, r5, r6, r7} + mov r12, #0 + umlal r8, r12, r0, r4 + umaal r9, r12, r1, r4 + umaal r10, r12, r2, r4 + umaal r11, r12, r3, r4 mov r4, #0 - adcs r6, r6, r12 - adc r4, r4, #0 - # A[3] * B[1] = 4 - ldr r10, [r1, #12] - umull r3, r12, r10, lr - adds r5, r5, r3 - adcs r6, r6, r12 - adc r4, r4, #0 - # A[2] * B[2] = 4 - ldr r10, [r1, #8] - umull r3, r12, r10, r11 - adds r5, r5, r3 - adcs r6, r6, r12 - adc r4, r4, #0 - # A[1] * B[3] = 4 - ldr r11, [r2, #12] - umull r3, r12, r8, r11 - adds r5, r5, r3 - adcs r6, r6, r12 - adc r4, r4, #0 - # A[0] * B[4] = 4 - ldr r11, [r2, #16] - umull r3, r12, r7, r11 - adds r5, r5, r3 - adcs r6, r6, r12 - adc r4, r4, #0 - str r5, [sp, #16] - # A[0] * B[5] = 5 - ldr r11, [r2, #20] - umull r3, r12, r7, r11 - adds r6, r6, r3 + umlal r9, r4, r0, r5 + umaal r10, r4, r1, r5 + umaal r11, r4, r2, r5 + umaal r12, r4, r3, r5 mov r5, #0 - adcs r4, r4, r12 - adc r5, r5, #0 - # A[1] * B[4] = 5 - ldr r11, [r2, #16] - umull r3, r12, r8, r11 - adds r6, r6, r3 - adcs r4, r4, r12 - adc r5, r5, #0 - # A[2] * B[3] = 5 - ldr r11, [r2, #12] - umull r3, r12, r10, r11 - adds r6, r6, r3 - adcs r4, r4, r12 - adc r5, r5, #0 - # A[3] * B[2] = 5 - ldr r10, [r1, #12] - ldr r11, [r2, #8] - umull r3, r12, r10, r11 - adds r6, r6, r3 - adcs r4, r4, r12 - adc r5, r5, #0 - # A[4] * B[1] = 5 - ldr r10, [r1, #16] - umull r3, r12, r10, lr - adds r6, r6, r3 - adcs r4, r4, r12 - adc r5, r5, #0 - # A[5] * B[0] = 5 - ldr r10, [r1, #20] - umull r3, r12, r10, r9 - adds r6, r6, r3 - adcs r4, r4, r12 - adc r5, r5, #0 - str r6, [sp, #20] - # A[6] * B[0] = 6 - ldr r10, [r1, #24] - umull r3, r12, r10, r9 - adds r4, r4, r3 + umlal r10, r5, r0, r6 + umaal r11, r5, r1, r6 + umaal r12, r5, r2, r6 + umaal r4, r5, r3, r6 mov r6, #0 - adcs r5, r5, r12 - adc r6, r6, #0 - # A[5] * B[1] = 6 - ldr r10, [r1, #20] - umull r3, r12, r10, lr - adds r4, r4, r3 - adcs r5, r5, r12 - adc r6, r6, #0 - # A[4] * B[2] = 6 - ldr r10, [r1, #16] - umull r3, r12, r10, r11 - adds r4, r4, r3 - adcs r5, r5, r12 - adc r6, r6, #0 - # A[3] * B[3] = 6 - ldr r10, [r1, #12] - ldr r11, [r2, #12] - umull r3, r12, r10, r11 - adds r4, r4, r3 - adcs r5, r5, r12 - adc r6, r6, #0 - # A[2] * B[4] = 6 - ldr r10, [r1, #8] - ldr r11, [r2, #16] - umull r3, r12, r10, r11 - adds r4, r4, r3 - adcs r5, r5, r12 - adc r6, r6, #0 - # A[1] * B[5] = 6 - ldr r11, [r2, #20] - umull r3, r12, r8, r11 - adds r4, r4, r3 - adcs r5, r5, r12 - adc r6, r6, #0 - # A[0] * B[6] = 6 - ldr r11, [r2, #24] - umull r3, r12, r7, r11 - adds r4, r4, r3 - adcs r5, r5, r12 - adc r6, r6, #0 - str r4, [sp, #24] - # A[0] * B[7] = 7 - ldr r11, [r2, #28] - umull r3, r12, r7, r11 - adds r5, r5, r3 - mov r4, #0 - adcs r6, r6, r12 - adc r4, r4, #0 - # A[1] * B[6] = 7 - ldr r11, [r2, #24] - umull r3, r12, r8, r11 - adds r5, r5, r3 - adcs r6, r6, r12 - adc r4, r4, #0 - # A[2] * B[5] = 7 - ldr r11, [r2, #20] - umull r3, r12, r10, r11 - adds r5, r5, r3 - adcs r6, r6, r12 - adc r4, r4, #0 - # A[3] * B[4] = 7 - ldr r10, [r1, #12] - ldr r11, 
[r2, #16] - umull r3, r12, r10, r11 - adds r5, r5, r3 - adcs r6, r6, r12 - adc r4, r4, #0 - # A[4] * B[3] = 7 - ldr r10, [r1, #16] - ldr r11, [r2, #12] - umull r3, r12, r10, r11 - adds r5, r5, r3 - adcs r6, r6, r12 - adc r4, r4, #0 - # A[5] * B[2] = 7 - ldr r10, [r1, #20] - ldr r11, [r2, #8] - umull r3, r12, r10, r11 - adds r5, r5, r3 - adcs r6, r6, r12 - adc r4, r4, #0 - # A[6] * B[1] = 7 - ldr r10, [r1, #24] - umull r3, r12, r10, lr - adds r5, r5, r3 - adcs r6, r6, r12 - adc r4, r4, #0 - # A[7] * B[0] = 7 - ldr r10, [r1, #28] - umull r3, r12, r10, r9 - adds r5, r5, r3 - adcs r6, r6, r12 - adc r4, r4, #0 - str r5, [sp, #28] - ldr r7, [r1, #24] - ldr r9, [r2, #24] - # A[7] * B[1] = 8 - umull r3, r12, r10, lr - adds r6, r6, r3 - mov r5, #0 - adcs r4, r4, r12 - adc r5, r5, #0 - # A[6] * B[2] = 8 - umull r3, r12, r7, r11 - adds r6, r6, r3 - adcs r4, r4, r12 - adc r5, r5, #0 - # A[5] * B[3] = 8 - ldr r10, [r1, #20] - ldr r11, [r2, #12] - umull r3, r12, r10, r11 - adds r6, r6, r3 - adcs r4, r4, r12 - adc r5, r5, #0 - # A[4] * B[4] = 8 - ldr r10, [r1, #16] - ldr r11, [r2, #16] - umull r3, r12, r10, r11 - adds r6, r6, r3 - adcs r4, r4, r12 - adc r5, r5, #0 - # A[3] * B[5] = 8 - ldr r10, [r1, #12] - ldr r11, [r2, #20] - umull r3, r12, r10, r11 - adds r6, r6, r3 - adcs r4, r4, r12 - adc r5, r5, #0 - # A[2] * B[6] = 8 - ldr r10, [r1, #8] - umull r3, r12, r10, r9 - adds r6, r6, r3 - adcs r4, r4, r12 - adc r5, r5, #0 - # A[1] * B[7] = 8 - ldr r11, [r2, #28] - umull r3, r12, r8, r11 - adds r6, r6, r3 - adcs r4, r4, r12 - adc r5, r5, #0 + umlal r11, r6, r0, r7 + ldr r0, [sp, #40] + umaal r12, r6, r1, r7 + add r0, r0, #16 + umaal r4, r6, r2, r7 + sub lr, lr, #16 + umaal r5, r6, r3, r7 + ldm r0, {r0, r1, r2, r3} str r6, [sp, #32] - ldr r8, [r1, #28] - mov lr, r11 - # A[2] * B[7] = 9 - umull r3, r12, r10, lr - adds r4, r4, r3 - mov r6, #0 - adcs r5, r5, r12 - adc r6, r6, #0 - # A[3] * B[6] = 9 - ldr r10, [r1, #12] - umull r3, r12, r10, r9 - adds r4, r4, r3 - adcs r5, r5, r12 - adc r6, r6, #0 - # A[4] * B[5] = 9 - ldr r10, [r1, #16] - ldr r11, [r2, #20] - umull r3, r12, r10, r11 - adds r4, r4, r3 - adcs r5, r5, r12 - adc r6, r6, #0 - # A[5] * B[4] = 9 - ldr r10, [r1, #20] - ldr r11, [r2, #16] - umull r3, r12, r10, r11 - adds r4, r4, r3 - adcs r5, r5, r12 - adc r6, r6, #0 - # A[6] * B[3] = 9 - ldr r11, [r2, #12] - umull r3, r12, r7, r11 - adds r4, r4, r3 - adcs r5, r5, r12 - adc r6, r6, #0 - # A[7] * B[2] = 9 - ldr r11, [r2, #8] - umull r3, r12, r8, r11 - adds r4, r4, r3 - adcs r5, r5, r12 - adc r6, r6, #0 - str r4, [sp, #36] - # A[7] * B[3] = 10 - ldr r11, [r2, #12] - umull r3, r12, r8, r11 - adds r5, r5, r3 - mov r4, #0 - adcs r6, r6, r12 - adc r4, r4, #0 - # A[6] * B[4] = 10 - ldr r11, [r2, #16] - umull r3, r12, r7, r11 - adds r5, r5, r3 - adcs r6, r6, r12 - adc r4, r4, #0 - # A[5] * B[5] = 10 - ldr r11, [r2, #20] - umull r3, r12, r10, r11 - adds r5, r5, r3 - adcs r6, r6, r12 - adc r4, r4, #0 - # A[4] * B[6] = 10 - ldr r10, [r1, #16] - umull r3, r12, r10, r9 - adds r5, r5, r3 - adcs r6, r6, r12 - adc r4, r4, #0 - # A[3] * B[7] = 10 - ldr r10, [r1, #12] - umull r3, r12, r10, lr - adds r5, r5, r3 - adcs r6, r6, r12 - adc r4, r4, #0 - str r5, [sp, #40] - # A[4] * B[7] = 11 - ldr r10, [r1, #16] - umull r3, r12, r10, lr - adds r6, r6, r3 - mov r5, #0 - adcs r4, r4, r12 - adc r5, r5, #0 - # A[5] * B[6] = 11 - ldr r10, [r1, #20] - umull r3, r12, r10, r9 - adds r6, r6, r3 - adcs r4, r4, r12 - adc r5, r5, #0 - # A[6] * B[5] = 11 - umull r3, r12, r7, r11 - adds r6, r6, r3 - adcs r4, r4, r12 - adc r5, r5, #0 - # 
A[7] * B[4] = 11 - ldr r11, [r2, #16] - umull r3, r12, r8, r11 - adds r6, r6, r3 - adcs r4, r4, r12 - adc r5, r5, #0 - str r6, [sp, #44] - # A[7] * B[5] = 12 - ldr r11, [r2, #20] - umull r3, r12, r8, r11 - adds r4, r4, r3 - mov r6, #0 - adcs r5, r5, r12 - adc r6, r6, #0 - # A[6] * B[6] = 12 - umull r3, r12, r7, r9 - adds r4, r4, r3 - adcs r5, r5, r12 - adc r6, r6, #0 - # A[5] * B[7] = 12 - umull r3, r12, r10, lr - adds r4, r4, r3 - adcs r5, r5, r12 - adc r6, r6, #0 - str r4, [sp, #48] - # A[6] * B[7] = 13 - umull r3, r12, r7, lr - adds r5, r5, r3 - mov r4, #0 - adcs r6, r6, r12 - adc r4, r4, #0 - # A[7] * B[6] = 13 - umull r3, r12, r8, r9 - adds r5, r5, r3 - adcs r6, r6, r12 - adc r4, r4, #0 - str r5, [sp, #52] - # A[7] * B[7] = 14 - umull r3, r12, r8, lr - adds r6, r6, r3 - adc r4, r4, r12 - str r6, [sp, #56] - str r4, [sp, #60] + ldm lr!, {r6} + mov r7, #0 + umlal r8, r7, r0, r6 + umaal r9, r7, r1, r6 + str r8, [sp, #16] + umaal r10, r7, r2, r6 + umaal r11, r7, r3, r6 + ldm lr!, {r6} + mov r8, #0 + umlal r9, r8, r0, r6 + umaal r10, r8, r1, r6 + str r9, [sp, #20] + umaal r11, r8, r2, r6 + umaal r12, r8, r3, r6 + ldm lr!, {r6} + mov r9, #0 + umlal r10, r9, r0, r6 + umaal r11, r9, r1, r6 + str r10, [sp, #24] + umaal r12, r9, r2, r6 + umaal r4, r9, r3, r6 + ldm lr!, {r6} + mov r10, #0 + umlal r11, r10, r0, r6 + umaal r12, r10, r1, r6 + str r11, [sp, #28] + umaal r4, r10, r2, r6 + umaal r5, r10, r3, r6 + ldm lr!, {r11} + umaal r12, r7, r0, r11 + umaal r4, r7, r1, r11 + ldr r6, [sp, #32] + umaal r5, r7, r2, r11 + umaal r6, r7, r3, r11 + ldm lr!, {r11} + umaal r4, r8, r0, r11 + umaal r5, r8, r1, r11 + umaal r6, r8, r2, r11 + umaal r7, r8, r3, r11 + ldm lr, {r11, lr} + umaal r5, r9, r0, r11 + umaal r6, r10, r0, lr + umaal r6, r9, r1, r11 + umaal r7, r10, r1, lr + umaal r7, r9, r2, r11 + umaal r8, r10, r2, lr + umaal r8, r9, r3, r11 + umaal r9, r10, r3, lr # Reduce - # Load bottom half -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp] - ldr r5, [sp, #4] -#else - ldrd r4, r5, [sp] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #8] - ldr r7, [sp, #12] -#else - ldrd r6, r7, [sp, #8] -#endif + ldr r0, [sp, #28] + mov lr, #37 + umaal r10, r0, r10, lr + mov lr, #19 + lsl r0, r0, #1 + orr r0, r0, r10, lsr #31 + mul r11, r0, lr + pop {r0, r1, r2} + mov lr, #38 + umaal r0, r11, r12, lr + umaal r1, r11, r4, lr + umaal r2, r11, r5, lr + pop {r3, r4, r5} + umaal r3, r11, r6, lr + umaal r4, r11, r7, lr + umaal r5, r11, r8, lr + pop {r6} + bfc r10, #31, #1 + umaal r6, r11, r9, lr + add r7, r10, r11 + ldr lr, [sp, #8] + # Store + stm lr, {r0, r1, r2, r3, r4, r5, r6, r7} + add sp, sp, #16 + pop {pc} + .size fe_mul_op,.-fe_mul_op + .text + .align 4 + .globl fe_mul + .type fe_mul, %function +fe_mul: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + bl fe_mul_op + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size fe_mul,.-fe_mul + .text + .align 4 + .globl fe_sq_op + .type fe_sq_op, %function +fe_sq_op: + push {lr} + sub sp, sp, #32 + str r0, [sp, #28] + ldm r1, {r0, r1, r2, r3, r4, r5, r6, r7} + # Square + umull r9, r10, r0, r0 + umull r11, r12, r0, r1 + adds r11, r11, r11 + mov lr, #0 + umaal r10, r11, lr, lr + stm sp, {r9, r10} + mov r8, lr + umaal r8, r12, r0, r2 + adcs r8, r8, r8 + umaal r8, r11, r1, r1 + umull r9, r10, r0, r3 + umaal r9, r12, r1, r2 + adcs r9, r9, r9 + umaal r9, r11, lr, lr #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [sp, #16] - ldr r9, [sp, #20] + str r8, [sp, #8] + str r9, [sp, #12] #else - ldrd 
r8, r9, [sp, #16] + strd r8, r9, [sp, #8] #endif + mov r9, lr + umaal r9, r10, r0, r4 + umaal r9, r12, r1, r3 + adcs r9, r9, r9 + umaal r9, r11, r2, r2 + str r9, [sp, #16] + umull r9, r8, r0, r5 + umaal r9, r12, r1, r4 + umaal r9, r10, r2, r3 + adcs r9, r9, r9 + umaal r9, r11, lr, lr + str r9, [sp, #20] + mov r9, lr + umaal r9, r8, r0, r6 + umaal r9, r12, r1, r5 + umaal r9, r10, r2, r4 + adcs r9, r9, r9 + umaal r9, r11, r3, r3 + str r9, [sp, #24] + umull r0, r9, r0, r7 + umaal r0, r8, r1, r6 + umaal r0, r12, r2, r5 + umaal r0, r10, r3, r4 + adcs r0, r0, r0 + umaal r0, r11, lr, lr + # R[7] = r0 + umaal r9, r8, r1, r7 + umaal r9, r10, r2, r6 + umaal r12, r9, r3, r5 + adcs r12, r12, r12 + umaal r12, r11, r4, r4 + # R[8] = r12 + umaal r9, r8, r2, r7 + umaal r10, r9, r3, r6 + mov r2, lr + umaal r10, r2, r4, r5 + adcs r10, r10, r10 + umaal r11, r10, lr, lr + # R[9] = r11 + umaal r2, r8, r3, r7 + umaal r2, r9, r4, r6 + adcs r3, r2, r2 + umaal r10, r3, r5, r5 + # R[10] = r10 + mov r1, lr + umaal r1, r8, r4, r7 + umaal r1, r9, r5, r6 + adcs r4, r1, r1 + umaal r3, r4, lr, lr + # R[11] = r3 + umaal r8, r9, r5, r7 + adcs r8, r8, r8 + umaal r4, r8, r6, r6 + # R[12] = r4 + mov r5, lr + umaal r5, r9, r6, r7 + adcs r5, r5, r5 + umaal r8, r5, lr, lr + # R[13] = r8 + adcs r9, r9, r9 + umaal r9, r5, r7, r7 + adcs r7, r5, lr + # R[14] = r9 + # R[15] = r7 + # Reduce + mov r6, #37 + umaal r7, r0, r7, r6 + mov r6, #19 + lsl r0, r0, #1 + orr r0, r0, r7, lsr #31 + mul lr, r0, r6 + pop {r0, r1} + mov r6, #38 + umaal r0, lr, r12, r6 + umaal r1, lr, r11, r6 + mov r12, r3 + mov r11, r4 + pop {r2, r3, r4} + umaal r2, lr, r10, r6 + umaal r3, lr, r12, r6 + umaal r4, lr, r11, r6 + mov r12, r6 + pop {r5, r6} + umaal r5, lr, r8, r12 + bfc r7, #31, #1 + umaal r6, lr, r9, r12 + add r7, r7, lr + pop {lr} + # Store + stm lr, {r0, r1, r2, r3, r4, r5, r6, r7} + pop {pc} + .size fe_sq_op,.-fe_sq_op + .text + .align 4 + .globl fe_sq + .type fe_sq, %function +fe_sq: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + bl fe_sq_op + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size fe_sq,.-fe_sq + .text + .align 4 + .globl fe_mul121666 + .type fe_mul121666, %function +fe_mul121666: + push {r4, r5, r6, r7, r8, r9, r10, lr} + # Multiply by 121666 + ldm r1, {r2, r3, r4, r5, r6, r7, r8, r9} #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [sp, #24] - ldr r11, [sp, #28] + mov lr, #0xdb + lsl lr, lr, #8 + add lr, lr, #0x42 #else - ldrd r10, r11, [sp, #24] + mov lr, #0xdb42 #endif - lsr r3, r11, #31 - and r11, r11, #0x7fffffff + movt lr, #1 + umull r2, r10, r2, lr + sub r12, lr, #1 + umaal r3, r10, r3, r12 + umaal r4, r10, r4, r12 + umaal r5, r10, r5, r12 + umaal r6, r10, r6, r12 + umaal r7, r10, r7, r12 + umaal r8, r10, r8, r12 mov lr, #19 - ldr r1, [sp, #32] - orr r3, r3, r1, lsl #1 - umull r3, r12, lr, r3 - adds r4, r4, r3 - mov r2, #0 - adcs r5, r5, r12 - adc r2, r2, #0 - lsr r3, r1, #31 - ldr r1, [sp, #36] - orr r3, r3, r1, lsl #1 - umull r3, r12, lr, r3 - add r12, r12, r2 - adds r5, r5, r3 - mov r2, #0 - adcs r6, r6, r12 - adc r2, r2, #0 - lsr r3, r1, #31 - ldr r1, [sp, #40] - orr r3, r3, r1, lsl #1 - umull r3, r12, lr, r3 - add r12, r12, r2 - adds r6, r6, r3 - mov r2, #0 - adcs r7, r7, r12 - adc r2, r2, #0 - lsr r3, r1, #31 - ldr r1, [sp, #44] - orr r3, r3, r1, lsl #1 - umull r3, r12, lr, r3 - add r12, r12, r2 - adds r7, r7, r3 - mov r2, #0 - adcs r8, r8, r12 - adc r2, r2, #0 - lsr r3, r1, #31 - ldr r1, [sp, #48] - orr r3, r3, r1, lsl #1 - umull r3, r12, lr, r3 - add r12, r12, r2 - adds r8, r8, r3 - mov r2, #0 - 
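The rewritten fe_mul_op and fe_sq_op in this hunk build the 512-bit product with umull/umlal/umaal and reduce it in a single pass: the upper eight product words weigh 2^256 = 2*19 = 38 (mod p), so they are folded onto the lower eight with a factor of 38, and the leftover bit-255 overflow is then folded with a factor of 19. Below is a plain-C model of that reduction, without the umaal scheduling; t holds the 16-word product, least significant word first.

#include <stdint.h>

/* Sketch of the 512->256 bit reduction: fold the high half in by 38
 * (2^256 == 38 mod 2^255-19), then fold bits >= 2^255 in by 19. */
static void fe_reduce512_model(uint32_t r[8], const uint32_t t[16])
{
    uint64_t c = 0;
    uint32_t o;
    int i;

    for (i = 0; i < 8; i++) {
        c += t[i] + (uint64_t)t[i + 8] * 38;
        r[i] = (uint32_t)c;
        c >>= 32;
    }
    o = ((uint32_t)c << 1) | (r[7] >> 31);   /* everything at or above 2^255 */
    r[7] &= 0x7fffffff;
    c = (uint64_t)o * 19;
    for (i = 0; i < 8; i++) {
        c += r[i];
        r[i] = (uint32_t)c;
        c >>= 32;
    }
}

The lone multiply by 37 at the start of the assembly reduction is the same fold written with the top word reused as its own accumulator: umaal x, c, x, #37 computes 37*x + x + c = 38*x + c.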
adcs r9, r9, r12 - adc r2, r2, #0 - lsr r3, r1, #31 - ldr r1, [sp, #52] - orr r3, r3, r1, lsl #1 - umull r3, r12, lr, r3 - add r12, r12, r2 - adds r9, r9, r3 - mov r2, #0 - adcs r10, r10, r12 - adc r2, r2, #0 - lsr r3, r1, #31 - ldr r1, [sp, #56] - orr r3, r3, r1, lsl #1 - umull r3, r12, lr, r3 - add r12, r12, r2 - adds r10, r10, r3 - mov r2, #0 - adcs r11, r11, r12 - adc r2, r2, #0 - lsr r3, r1, #31 - ldr r1, [sp, #60] - orr r3, r3, r1, lsl #1 - umull r3, r12, lr, r3 - adds r11, r11, r3 - adc r3, r12, r2 - # Overflow - lsl r3, r3, #1 - orr r3, r3, r11, lsr #31 - mul r3, r3, lr - and r11, r11, #0x7fffffff - adds r4, r4, r3 - adcs r5, r5, #0 - adcs r6, r6, #0 - adcs r7, r7, #0 - adcs r8, r8, #0 - adcs r9, r9, #0 - adcs r10, r10, #0 - adc r11, r11, #0 - # Reduce if top bit set - asr r3, r11, #31 - and r3, r3, lr - and r11, r11, #0x7fffffff - adds r4, r4, r3 + umaal r9, r10, r9, r12 + lsl r10, r10, #1 + orr r10, r10, r9, lsr #31 + mul r10, r10, lr + adds r2, r2, r10 + adcs r3, r3, #0 + adcs r4, r4, #0 adcs r5, r5, #0 adcs r6, r6, #0 adcs r7, r7, #0 + bfc r9, #31, #1 adcs r8, r8, #0 - adcs r9, r9, #0 - adcs r10, r10, #0 - adc r11, r11, #0 - # Store + adc r9, r9, #0 + stm r0, {r2, r3, r4, r5, r6, r7, r8, r9} + pop {r4, r5, r6, r7, r8, r9, r10, pc} + .size fe_mul121666,.-fe_mul121666 +#ifndef WC_NO_CACHE_RESISTANT + .text + .align 4 + .globl curve25519 + .type curve25519, %function +curve25519: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #0xbc + str r0, [sp, #160] + str r1, [sp, #164] + str r2, [sp, #168] + mov r1, #0 + str r1, [sp, #172] + # Set one + mov r10, #1 + mov r11, #0 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] + str r10, [r0] + str r11, [r0, #4] #else - strd r4, r5, [r0] + strd r10, r11, [r0] #endif + mov r10, #0 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] + str r10, [r0, #8] + str r11, [r0, #12] #else - strd r6, r7, [r0, #8] + strd r10, r11, [r0, #8] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] + str r10, [r0, #16] + str r11, [r0, #20] #else - strd r8, r9, [r0, #16] + strd r10, r11, [r0, #16] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) str r10, [r0, #24] @@ -2986,544 +2753,135 @@ fe_mul: #else strd r10, r11, [r0, #24] #endif - add sp, sp, #0x40 - pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} - .size fe_mul,.-fe_mul - .text - .align 4 - .globl fe_sq - .type fe_sq, %function -fe_sq: - push {r4, r5, r6, r7, r8, r9, r10, r11, lr} - sub sp, sp, #0x40 - # Square - ldr r7, [r1] - ldr r8, [r1, #4] - ldr r9, [r1, #8] - ldr r10, [r1, #12] - ldr r12, [r1, #16] - # A[0] * A[0] = 0 - umull r4, r5, r7, r7 - str r4, [sp] - # A[0] * A[1] = 1 - umull r2, r3, r7, r8 - mov r6, #0 - adds r5, r5, r2 - adc r6, r6, r3 - adds r5, r5, r2 - mov r4, #0 - adcs r6, r6, r3 - adc r4, r4, #0 - str r5, [sp, #4] - # A[1] * A[1] = 2 - umull r2, r3, r8, r8 - adds r6, r6, r2 - adc r4, r4, r3 - # A[0] * A[2] = 2 - umull r2, r3, r7, r9 - adds r6, r6, r2 - mov r5, #0 - adcs r4, r4, r3 - adc r5, r5, #0 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - str r6, [sp, #8] - # A[0] * A[3] = 3 - umull r2, r3, r7, r10 - adds r4, r4, r2 - adc r5, r5, r3 - adds r4, r4, r2 - mov r6, #0 - adcs r5, r5, r3 - adc r6, r6, #0 - # A[1] * A[2] = 3 - umull r2, r3, r8, r9 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - str r4, [sp, #12] - # A[2] * A[2] = 4 - umull r2, r3, 
r9, r9 - adds r5, r5, r2 - mov r4, #0 - adcs r6, r6, r3 - adc r4, r4, #0 - # A[1] * A[3] = 4 - umull r2, r3, r8, r10 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - # A[0] * A[4] = 4 - umull r2, r3, r7, r12 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - str r5, [sp, #16] - # A[0] * A[5] = 5 - ldr r11, [r1, #20] - umull r2, r3, r7, r11 - adds r6, r6, r2 - mov r5, #0 - adcs r4, r4, r3 - adc r5, r5, #0 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - # A[1] * A[4] = 5 - umull r2, r3, r8, r12 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - # A[2] * A[3] = 5 - umull r2, r3, r9, r10 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - str r6, [sp, #20] - # A[3] * A[3] = 6 - umull r2, r3, r10, r10 - adds r4, r4, r2 - mov r6, #0 - adcs r5, r5, r3 - adc r6, r6, #0 - # A[2] * A[4] = 6 - umull r2, r3, r9, r12 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - # A[1] * A[5] = 6 - umull r2, r3, r8, r11 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - # A[0] * A[6] = 6 - ldr r11, [r1, #24] - umull r2, r3, r7, r11 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - str r4, [sp, #24] - # A[0] * A[7] = 7 - ldr r11, [r1, #28] - umull r2, r3, r7, r11 - adds r5, r5, r2 - mov r4, #0 - adcs r6, r6, r3 - adc r4, r4, #0 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - # A[1] * A[6] = 7 - ldr r11, [r1, #24] - umull r2, r3, r8, r11 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - # A[2] * A[5] = 7 - ldr r11, [r1, #20] - umull r2, r3, r9, r11 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - # A[3] * A[4] = 7 - umull r2, r3, r10, r12 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - str r5, [sp, #28] - # A[4] * A[4] = 8 - umull r2, r3, r12, r12 - adds r6, r6, r2 - mov r5, #0 - adcs r4, r4, r3 - adc r5, r5, #0 - # A[3] * A[5] = 8 - umull r2, r3, r10, r11 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - # A[2] * A[6] = 8 - ldr r11, [r1, #24] - umull r2, r3, r9, r11 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - # A[1] * A[7] = 8 - ldr r11, [r1, #28] - umull r2, r3, r8, r11 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - str r6, [sp, #32] - ldr r7, [r1, #20] - # A[2] * A[7] = 9 - umull r2, r3, r9, r11 - adds r4, r4, r2 - mov r6, #0 - adcs r5, r5, r3 - adc r6, r6, #0 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - # A[3] * A[6] = 9 - ldr r11, [r1, #24] - umull r2, r3, r10, r11 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - # A[4] * A[5] = 9 - umull r2, r3, r12, r7 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - str r4, [sp, #36] - mov r8, r11 - # A[5] * A[5] = 10 - umull r2, r3, r7, r7 - adds r5, r5, r2 - mov r4, #0 - adcs r6, r6, r3 - adc r4, r4, #0 - # A[4] * A[6] = 10 - umull r2, r3, r12, r8 - adds r5, r5, r2 - adcs r6, r6, r3 - 
adc r4, r4, #0 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - # A[3] * A[7] = 10 - ldr r11, [r1, #28] - umull r2, r3, r10, r11 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - str r5, [sp, #40] - mov r9, r11 - # A[4] * A[7] = 11 - umull r2, r3, r12, r9 - adds r6, r6, r2 - mov r5, #0 - adcs r4, r4, r3 - adc r5, r5, #0 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - # A[5] * A[6] = 11 - umull r2, r3, r7, r8 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - str r6, [sp, #44] - # A[6] * A[6] = 12 - umull r2, r3, r8, r8 - adds r4, r4, r2 - mov r6, #0 - adcs r5, r5, r3 - adc r6, r6, #0 - # A[5] * A[7] = 12 - umull r2, r3, r7, r9 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - str r4, [sp, #48] - # A[6] * A[7] = 13 - umull r2, r3, r8, r9 - adds r5, r5, r2 - mov r4, #0 - adcs r6, r6, r3 - adc r4, r4, #0 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - str r5, [sp, #52] - # A[7] * A[7] = 14 - umull r2, r3, r9, r9 - adds r6, r6, r2 - adc r4, r4, r3 - str r6, [sp, #56] - str r4, [sp, #60] - # Reduce - # Load bottom half + # Set zero + mov r10, #0 + mov r11, #0 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp] - ldr r5, [sp, #4] + str r10, [sp] + str r11, [sp, #4] #else - ldrd r4, r5, [sp] + strd r10, r11, [sp] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #8] - ldr r7, [sp, #12] + str r10, [sp, #8] + str r11, [sp, #12] #else - ldrd r6, r7, [sp, #8] + strd r10, r11, [sp, #8] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [sp, #16] - ldr r9, [sp, #20] + str r10, [sp, #16] + str r11, [sp, #20] #else - ldrd r8, r9, [sp, #16] + strd r10, r11, [sp, #16] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [sp, #24] - ldr r11, [sp, #28] + str r10, [sp, #24] + str r11, [sp, #28] #else - ldrd r10, r11, [sp, #24] + strd r10, r11, [sp, #24] #endif - lsr r2, r11, #31 - and r11, r11, #0x7fffffff - mov r12, #19 - ldr r1, [sp, #32] - orr r2, r2, r1, lsl #1 - umull r2, r3, r12, r2 - adds r4, r4, r2 - mov lr, #0 - adcs r5, r5, r3 - adc lr, lr, #0 - lsr r2, r1, #31 - ldr r1, [sp, #36] - orr r2, r2, r1, lsl #1 - umull r2, r3, r12, r2 - add r3, r3, lr - adds r5, r5, r2 - mov lr, #0 - adcs r6, r6, r3 - adc lr, lr, #0 - lsr r2, r1, #31 - ldr r1, [sp, #40] - orr r2, r2, r1, lsl #1 - umull r2, r3, r12, r2 - add r3, r3, lr - adds r6, r6, r2 - mov lr, #0 - adcs r7, r7, r3 - adc lr, lr, #0 - lsr r2, r1, #31 - ldr r1, [sp, #44] - orr r2, r2, r1, lsl #1 - umull r2, r3, r12, r2 - add r3, r3, lr - adds r7, r7, r2 - mov lr, #0 - adcs r8, r8, r3 - adc lr, lr, #0 - lsr r2, r1, #31 - ldr r1, [sp, #48] - orr r2, r2, r1, lsl #1 - umull r2, r3, r12, r2 - add r3, r3, lr - adds r8, r8, r2 - mov lr, #0 - adcs r9, r9, r3 - adc lr, lr, #0 - lsr r2, r1, #31 - ldr r1, [sp, #52] - orr r2, r2, r1, lsl #1 - umull r2, r3, r12, r2 - add r3, r3, lr - adds r9, r9, r2 - mov lr, #0 - adcs r10, r10, r3 - adc lr, lr, #0 - lsr r2, r1, #31 - ldr r1, [sp, #56] - orr r2, r2, r1, lsl #1 - umull r2, r3, r12, r2 - add r3, r3, lr - adds r10, r10, r2 - mov lr, #0 - adcs r11, r11, r3 - adc lr, lr, #0 - lsr r2, r1, #31 - ldr r1, [sp, #60] - orr r2, r2, r1, lsl #1 - umull r2, r3, r12, r2 - adds r11, r11, r2 - adc r2, r3, lr - # Overflow - lsl r2, r2, #1 - orr r2, r2, r11, lsr #31 - mul r2, r2, r12 - and r11, r11, #0x7fffffff - adds r4, r4, r2 - adcs 
r5, r5, #0 - adcs r6, r6, #0 - adcs r7, r7, #0 - adcs r8, r8, #0 - adcs r9, r9, #0 - adcs r10, r10, #0 - adc r11, r11, #0 - # Reduce if top bit set - asr r2, r11, #31 - and r2, r2, r12 - and r11, r11, #0x7fffffff - adds r4, r4, r2 - adcs r5, r5, #0 - adcs r6, r6, #0 - adcs r7, r7, #0 - adcs r8, r8, #0 - adcs r9, r9, #0 - adcs r10, r10, #0 - adc r11, r11, #0 - # Store + # Set one + mov r10, #1 + mov r11, #0 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] + str r10, [sp, #32] + str r11, [sp, #36] #else - strd r4, r5, [r0] + strd r10, r11, [sp, #32] #endif + mov r10, #0 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] + str r10, [sp, #40] + str r11, [sp, #44] #else - strd r6, r7, [r0, #8] + strd r10, r11, [sp, #40] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] + str r10, [sp, #48] + str r11, [sp, #52] #else - strd r8, r9, [r0, #16] + strd r10, r11, [sp, #48] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #24] - str r11, [r0, #28] + str r10, [sp, #56] + str r11, [sp, #60] #else - strd r10, r11, [r0, #24] + strd r10, r11, [sp, #56] #endif - add sp, sp, #0x40 - pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} - .size fe_sq,.-fe_sq - .text - .align 4 - .globl fe_mul121666 - .type fe_mul121666, %function -fe_mul121666: - push {r4, r5, r6, r7, r8, r9, r10, lr} - # Multiply by 121666 + add r3, sp, #0x40 + # Copy + ldm r2, {r4, r5, r6, r7, r8, r9, r10, r11} + stm r3, {r4, r5, r6, r7, r8, r9, r10, r11} + mov r1, #30 + str r1, [sp, #180] + mov r2, #28 + str r2, [sp, #176] +L_curve25519_words: +L_curve25519_bits: + ldr r1, [sp, #164] + ldr r2, [r1, r2] + ldr r1, [sp, #180] + lsr r2, r2, r1 + and r2, r2, #1 + str r2, [sp, #184] + ldr r1, [sp, #172] + eor r1, r1, r2 + str r1, [sp, #172] + ldr r0, [sp, #160] + # Conditional Swap + rsb r1, r1, #0 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r2, [r1] - ldr r3, [r1, #4] + ldr r4, [r0] + ldr r5, [r0, #4] #else - ldrd r2, r3, [r1] + ldrd r4, r5, [r0] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #8] - ldr r5, [r1, #12] + ldr r6, [sp, #64] + ldr r7, [sp, #68] #else - ldrd r4, r5, [r1, #8] + ldrd r6, r7, [sp, #64] #endif + eor r8, r4, r6 + eor r9, r5, r7 + and r8, r8, r1 + and r9, r9, r1 + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r8 + eor r7, r7, r9 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #16] - ldr r7, [r1, #20] + str r4, [r0] + str r5, [r0, #4] #else - ldrd r6, r7, [r1, #16] + strd r4, r5, [r0] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r1, #24] - ldr r9, [r1, #28] + str r6, [sp, #64] + str r7, [sp, #68] #else - ldrd r8, r9, [r1, #24] + strd r6, r7, [sp, #64] #endif - movw lr, #0xdb42 - movt lr, #1 - umull r2, r10, r2, lr - umull r3, r12, r3, lr - adds r3, r3, r10 - adc r10, r12, #0 - umull r4, r12, r4, lr - adds r4, r4, r10 - adc r10, r12, #0 - umull r5, r12, r5, lr - adds r5, r5, r10 - adc r10, r12, #0 - umull r6, r12, r6, lr - adds r6, r6, r10 - adc r10, r12, #0 - umull r7, r12, r7, lr - adds r7, r7, r10 - adc r10, r12, #0 - umull r8, r12, r8, lr - adds r8, r8, r10 - adc r10, r12, #0 - umull r9, r12, r9, lr - adds r9, r9, r10 - adc r10, r12, #0 - mov lr, #19 - lsl r10, r10, #1 - orr r10, r10, r9, lsr #31 - mul r10, r10, lr - and r9, r9, #0x7fffffff - adds r2, r2, r10 - adcs r3, r3, #0 - adcs r4, r4, #0 - adcs r5, r5, #0 - adcs 
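The replacement fe_mul121666 in this file follows the same shape: one umull plus a chain of umaal instructions multiplies the eight limbs by the curve constant 121666 (0x1db42), and the words that spill past bit 255 are folded back in by 19. In C, roughly (a sketch, not the assembly's register usage):

#include <stdint.h>

/* Sketch of fe_mul121666: multiply by the constant, then fold the overflow
 * above bit 255 back in with a factor of 19.  Result stays partially reduced. */
static void fe_mul121666_model(uint32_t r[8], const uint32_t a[8])
{
    uint64_t c = 0;
    uint32_t o;
    int i;

    for (i = 0; i < 8; i++) {
        c += (uint64_t)a[i] * 121666;
        r[i] = (uint32_t)c;
        c >>= 32;
    }
    o = ((uint32_t)c << 1) | (r[7] >> 31);
    r[7] &= 0x7fffffff;
    c = (uint64_t)o * 19;
    for (i = 0; i < 8; i++) {
        c += r[i];
        r[i] = (uint32_t)c;
        c >>= 32;
    }
}

The sub r12, lr, #1 followed by umaal in the assembly is the multiply-accumulate form of the same product: a[i]*k + carry is computed as a[i]*(k-1) + a[i] + carry, so each umaal can reuse the source limb as its own accumulator.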
r6, r6, #0 - adcs r7, r7, #0 - adcs r8, r8, #0 - adc r9, r9, #0 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r2, [r0] - str r3, [r0, #4] + ldr r4, [r0, #8] + ldr r5, [r0, #12] #else - strd r2, r3, [r0] + ldrd r4, r5, [r0, #8] +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + ldr r6, [sp, #72] + ldr r7, [sp, #76] +#else + ldrd r6, r7, [sp, #72] #endif + eor r8, r4, r6 + eor r9, r5, r7 + and r8, r8, r1 + and r9, r9, r1 + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r8 + eor r7, r7, r9 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) str r4, [r0, #8] str r5, [r0, #12] @@ -3531,850 +2889,121 @@ fe_mul121666: strd r4, r5, [r0, #8] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #16] - str r7, [r0, #20] + str r6, [sp, #72] + str r7, [sp, #76] #else - strd r6, r7, [r0, #16] + strd r6, r7, [sp, #72] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #24] - str r9, [r0, #28] + ldr r4, [r0, #16] + ldr r5, [r0, #20] #else - strd r8, r9, [r0, #24] + ldrd r4, r5, [r0, #16] #endif - pop {r4, r5, r6, r7, r8, r9, r10, pc} - .size fe_mul121666,.-fe_mul121666 - .text - .align 4 - .globl fe_sq2 - .type fe_sq2, %function -fe_sq2: - push {r4, r5, r6, r7, r8, r9, r10, r11, lr} - sub sp, sp, #0x40 - # Square * 2 - ldr r7, [r1] - ldr r8, [r1, #4] - ldr r9, [r1, #8] - ldr r10, [r1, #12] - ldr r12, [r1, #16] - # A[0] * A[0] = 0 - umull r4, r5, r7, r7 - str r4, [sp] - # A[0] * A[1] = 1 - umull r2, r3, r7, r8 - mov r6, #0 - adds r5, r5, r2 - adc r6, r6, r3 - adds r5, r5, r2 - mov r4, #0 - adcs r6, r6, r3 - adc r4, r4, #0 - str r5, [sp, #4] - # A[1] * A[1] = 2 - umull r2, r3, r8, r8 - adds r6, r6, r2 - adc r4, r4, r3 - # A[0] * A[2] = 2 - umull r2, r3, r7, r9 - adds r6, r6, r2 - mov r5, #0 - adcs r4, r4, r3 - adc r5, r5, #0 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - str r6, [sp, #8] - # A[0] * A[3] = 3 - umull r2, r3, r7, r10 - adds r4, r4, r2 - adc r5, r5, r3 - adds r4, r4, r2 - mov r6, #0 - adcs r5, r5, r3 - adc r6, r6, #0 - # A[1] * A[2] = 3 - umull r2, r3, r8, r9 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - str r4, [sp, #12] - # A[2] * A[2] = 4 - umull r2, r3, r9, r9 - adds r5, r5, r2 - mov r4, #0 - adcs r6, r6, r3 - adc r4, r4, #0 - # A[1] * A[3] = 4 - umull r2, r3, r8, r10 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - # A[0] * A[4] = 4 - umull r2, r3, r7, r12 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - str r5, [sp, #16] - # A[0] * A[5] = 5 - ldr r11, [r1, #20] - umull r2, r3, r7, r11 - adds r6, r6, r2 - mov r5, #0 - adcs r4, r4, r3 - adc r5, r5, #0 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - # A[1] * A[4] = 5 - umull r2, r3, r8, r12 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - # A[2] * A[3] = 5 - umull r2, r3, r9, r10 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - str r6, [sp, #20] - # A[3] * A[3] = 6 - umull r2, r3, r10, r10 - adds r4, r4, r2 - mov r6, #0 - adcs r5, r5, r3 - adc r6, r6, #0 - # A[2] * A[4] = 6 - umull r2, r3, r9, r12 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - # A[1] * A[5] = 6 - umull r2, r3, r8, r11 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - adds r4, r4, r2 - 
adcs r5, r5, r3 - adc r6, r6, #0 - # A[0] * A[6] = 6 - ldr r11, [r1, #24] - umull r2, r3, r7, r11 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - str r4, [sp, #24] - # A[0] * A[7] = 7 - ldr r11, [r1, #28] - umull r2, r3, r7, r11 - adds r5, r5, r2 - mov r4, #0 - adcs r6, r6, r3 - adc r4, r4, #0 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - # A[1] * A[6] = 7 - ldr r11, [r1, #24] - umull r2, r3, r8, r11 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - # A[2] * A[5] = 7 - ldr r11, [r1, #20] - umull r2, r3, r9, r11 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - # A[3] * A[4] = 7 - umull r2, r3, r10, r12 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - str r5, [sp, #28] - # A[4] * A[4] = 8 - umull r2, r3, r12, r12 - adds r6, r6, r2 - mov r5, #0 - adcs r4, r4, r3 - adc r5, r5, #0 - # A[3] * A[5] = 8 - umull r2, r3, r10, r11 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - # A[2] * A[6] = 8 - ldr r11, [r1, #24] - umull r2, r3, r9, r11 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - # A[1] * A[7] = 8 - ldr r11, [r1, #28] - umull r2, r3, r8, r11 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - str r6, [sp, #32] - ldr r7, [r1, #20] - # A[2] * A[7] = 9 - umull r2, r3, r9, r11 - adds r4, r4, r2 - mov r6, #0 - adcs r5, r5, r3 - adc r6, r6, #0 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - # A[3] * A[6] = 9 - ldr r11, [r1, #24] - umull r2, r3, r10, r11 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - # A[4] * A[5] = 9 - umull r2, r3, r12, r7 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - str r4, [sp, #36] - mov r8, r11 - # A[5] * A[5] = 10 - umull r2, r3, r7, r7 - adds r5, r5, r2 - mov r4, #0 - adcs r6, r6, r3 - adc r4, r4, #0 - # A[4] * A[6] = 10 - umull r2, r3, r12, r8 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - # A[3] * A[7] = 10 - ldr r11, [r1, #28] - umull r2, r3, r10, r11 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - str r5, [sp, #40] - mov r9, r11 - # A[4] * A[7] = 11 - umull r2, r3, r12, r9 - adds r6, r6, r2 - mov r5, #0 - adcs r4, r4, r3 - adc r5, r5, #0 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - # A[5] * A[6] = 11 - umull r2, r3, r7, r8 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - adds r6, r6, r2 - adcs r4, r4, r3 - adc r5, r5, #0 - str r6, [sp, #44] - # A[6] * A[6] = 12 - umull r2, r3, r8, r8 - adds r4, r4, r2 - mov r6, #0 - adcs r5, r5, r3 - adc r6, r6, #0 - # A[5] * A[7] = 12 - umull r2, r3, r7, r9 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - adds r4, r4, r2 - adcs r5, r5, r3 - adc r6, r6, #0 - str r4, [sp, #48] - # A[6] * A[7] = 13 - umull r2, r3, r8, r9 - adds r5, r5, r2 - mov r4, #0 - adcs r6, r6, r3 - adc r4, r4, #0 - adds r5, r5, r2 - adcs r6, r6, r3 - adc r4, r4, #0 - str r5, [sp, #52] - # A[7] * A[7] = 14 - umull r2, r3, r9, r9 - adds r6, r6, r2 - adc r4, r4, r3 - str r6, [sp, #56] - str r4, [sp, #60] - # Double and Reduce - # Load bottom half #if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp] - ldr r5, [sp, #4] + ldr r6, [sp, #80] + ldr r7, [sp, #84] #else - ldrd r4, r5, [sp] + ldrd r6, r7, [sp, #80] #endif + eor r8, r4, r6 + eor r9, r5, r7 + and r8, r8, r1 + and r9, r9, r1 + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r8 + eor r7, r7, r9 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #8] - ldr r7, [sp, #12] + str r4, [r0, #16] + str r5, [r0, #20] #else - ldrd r6, r7, [sp, #8] + strd r4, r5, [r0, #16] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [sp, #16] - ldr r9, [sp, #20] + str r6, [sp, #80] + str r7, [sp, #84] #else - ldrd r8, r9, [sp, #16] + strd r6, r7, [sp, #80] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [sp, #24] - ldr r11, [sp, #28] + ldr r4, [r0, #24] + ldr r5, [r0, #28] #else - ldrd r10, r11, [sp, #24] + ldrd r4, r5, [r0, #24] #endif - lsr r2, r11, #30 - lsl r11, r11, #1 - orr r11, r11, r10, lsr #31 - lsl r10, r10, #1 - orr r10, r10, r9, lsr #31 - lsl r9, r9, #1 - orr r9, r9, r8, lsr #31 - lsl r8, r8, #1 - orr r8, r8, r7, lsr #31 - lsl r7, r7, #1 - orr r7, r7, r6, lsr #31 - lsl r6, r6, #1 - orr r6, r6, r5, lsr #31 - lsl r5, r5, #1 - orr r5, r5, r4, lsr #31 - lsl r4, r4, #1 - and r11, r11, #0x7fffffff - mov r12, #19 - ldr r1, [sp, #32] - orr r2, r2, r1, lsl #2 - umull r2, r3, r12, r2 - adds r4, r4, r2 - mov lr, #0 - adcs r5, r5, r3 - adc lr, lr, #0 - lsr r2, r1, #30 - ldr r1, [sp, #36] - orr r2, r2, r1, lsl #2 - umull r2, r3, r12, r2 - add r3, r3, lr - adds r5, r5, r2 - mov lr, #0 - adcs r6, r6, r3 - adc lr, lr, #0 - lsr r2, r1, #30 - ldr r1, [sp, #40] - orr r2, r2, r1, lsl #2 - umull r2, r3, r12, r2 - add r3, r3, lr - adds r6, r6, r2 - mov lr, #0 - adcs r7, r7, r3 - adc lr, lr, #0 - lsr r2, r1, #30 - ldr r1, [sp, #44] - orr r2, r2, r1, lsl #2 - umull r2, r3, r12, r2 - add r3, r3, lr - adds r7, r7, r2 - mov lr, #0 - adcs r8, r8, r3 - adc lr, lr, #0 - lsr r2, r1, #30 - ldr r1, [sp, #48] - orr r2, r2, r1, lsl #2 - umull r2, r3, r12, r2 - add r3, r3, lr - adds r8, r8, r2 - mov lr, #0 - adcs r9, r9, r3 - adc lr, lr, #0 - lsr r2, r1, #30 - ldr r1, [sp, #52] - orr r2, r2, r1, lsl #2 - umull r2, r3, r12, r2 - add r3, r3, lr - adds r9, r9, r2 - mov lr, #0 - adcs r10, r10, r3 - adc lr, lr, #0 - lsr r2, r1, #30 - ldr r1, [sp, #56] - orr r2, r2, r1, lsl #2 - umull r2, r3, r12, r2 - add r3, r3, lr - adds r10, r10, r2 - mov lr, #0 - adcs r11, r11, r3 - adc lr, lr, #0 - lsr r2, r1, #30 - ldr r1, [sp, #60] - orr r2, r2, r1, lsl #2 - umull r2, r3, r12, r2 - adds r11, r11, r2 - adc r2, r3, lr - # Overflow - lsl r2, r2, #1 - orr r2, r2, r11, lsr #31 - mul r2, r2, r12 - and r11, r11, #0x7fffffff - adds r4, r4, r2 - adcs r5, r5, #0 - adcs r6, r6, #0 - adcs r7, r7, #0 - adcs r8, r8, #0 - adcs r9, r9, #0 - adcs r10, r10, #0 - adc r11, r11, #0 - # Reduce if top bit set - asr r2, r11, #31 - and r2, r2, r12 - and r11, r11, #0x7fffffff - adds r4, r4, r2 - adcs r5, r5, #0 - adcs r6, r6, #0 - adcs r7, r7, #0 - adcs r8, r8, #0 - adcs r9, r9, #0 - adcs r10, r10, #0 - adc r11, r11, #0 - # Store #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] + ldr r6, [sp, #88] + ldr r7, [sp, #92] #else - strd r4, r5, [r0] + ldrd r6, r7, [sp, #88] #endif + eor r8, r4, r6 + eor r9, r5, r7 + and r8, r8, r1 + and r9, r9, r1 + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r8 + eor r7, r7, r9 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] + str r4, [r0, #24] + str r5, [r0, #28] 
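Note: the `# Conditional Swap` blocks in the added code implement the constant-time XOR swap used by the X25519 Montgomery ladder: the swap flag is expanded into an all-zeros or all-ones mask (the `rsb r1, r1, #0` replacing the old `neg`), and each pair of 32-bit limbs of the two working values is swapped through that mask, so memory accesses and timing do not depend on the secret bit. A minimal C sketch of the idea, with illustrative names only (not taken from the wolfSSL sources):

    #include <stdint.h>

    /* Swap the 8-limb values a and b iff bit == 1, in constant time. */
    static void cswap_sketch(uint32_t a[8], uint32_t b[8], uint32_t bit)
    {
        uint32_t mask = 0U - bit;              /* 0x00000000 or 0xffffffff */
        for (int i = 0; i < 8; i++) {
            uint32_t t = (a[i] ^ b[i]) & mask;
            a[i] ^= t;                         /* changed only when bit == 1 */
            b[i] ^= t;
        }
    }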
#else - strd r6, r7, [r0, #8] + strd r4, r5, [r0, #24] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] + str r6, [sp, #88] + str r7, [sp, #92] #else - strd r8, r9, [r0, #16] + strd r6, r7, [sp, #88] #endif + ldr r1, [sp, #172] + # Conditional Swap + rsb r1, r1, #0 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #24] - str r11, [r0, #28] + ldr r4, [sp] + ldr r5, [sp, #4] #else - strd r10, r11, [r0, #24] + ldrd r4, r5, [sp] #endif - add sp, sp, #0x40 - pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} - .size fe_sq2,.-fe_sq2 - .text - .align 4 - .globl fe_invert - .type fe_invert, %function -fe_invert: - push {r4, lr} - sub sp, sp, #0x88 - # Invert - str r0, [sp, #128] - str r1, [sp, #132] - mov r0, sp - ldr r1, [sp, #132] - bl fe_sq - add r0, sp, #32 - mov r1, sp - bl fe_sq - add r0, sp, #32 - add r1, sp, #32 - bl fe_sq - add r0, sp, #32 - ldr r1, [sp, #132] - add r2, sp, #32 - bl fe_mul - mov r0, sp - mov r1, sp - add r2, sp, #32 - bl fe_mul - add r0, sp, #0x40 - mov r1, sp - bl fe_sq - add r0, sp, #32 - add r1, sp, #32 - add r2, sp, #0x40 - bl fe_mul - add r0, sp, #0x40 - add r1, sp, #32 - bl fe_sq - mov r4, #4 -L_fe_invert1: - add r0, sp, #0x40 - add r1, sp, #0x40 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_fe_invert1 - add r0, sp, #32 - add r1, sp, #0x40 - add r2, sp, #32 - bl fe_mul - add r0, sp, #0x40 - add r1, sp, #32 - bl fe_sq - mov r4, #9 -L_fe_invert2: - add r0, sp, #0x40 - add r1, sp, #0x40 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_fe_invert2 - add r0, sp, #0x40 - add r1, sp, #0x40 - add r2, sp, #32 - bl fe_mul - add r0, sp, #0x60 - add r1, sp, #0x40 - bl fe_sq - mov r4, #19 -L_fe_invert3: - add r0, sp, #0x60 - add r1, sp, #0x60 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_fe_invert3 - add r0, sp, #0x40 - add r1, sp, #0x60 - add r2, sp, #0x40 - bl fe_mul - mov r4, #10 -L_fe_invert4: - add r0, sp, #0x40 - add r1, sp, #0x40 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_fe_invert4 - add r0, sp, #32 - add r1, sp, #0x40 - add r2, sp, #32 - bl fe_mul - add r0, sp, #0x40 - add r1, sp, #32 - bl fe_sq - mov r4, #49 -L_fe_invert5: - add r0, sp, #0x40 - add r1, sp, #0x40 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_fe_invert5 - add r0, sp, #0x40 - add r1, sp, #0x40 - add r2, sp, #32 - bl fe_mul - add r0, sp, #0x60 - add r1, sp, #0x40 - bl fe_sq - mov r4, #0x63 -L_fe_invert6: - add r0, sp, #0x60 - add r1, sp, #0x60 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_fe_invert6 - add r0, sp, #0x40 - add r1, sp, #0x60 - add r2, sp, #0x40 - bl fe_mul - mov r4, #50 -L_fe_invert7: - add r0, sp, #0x40 - add r1, sp, #0x40 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_fe_invert7 - add r0, sp, #32 - add r1, sp, #0x40 - add r2, sp, #32 - bl fe_mul - mov r4, #5 -L_fe_invert8: - add r0, sp, #32 - add r1, sp, #32 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_fe_invert8 - ldr r0, [sp, #128] - add r1, sp, #32 - mov r2, sp - bl fe_mul - ldr r1, [sp, #132] - ldr r0, [sp, #128] - add sp, sp, #0x88 - pop {r4, pc} - .size fe_invert,.-fe_invert - .text - .align 4 - .globl curve25519 - .type curve25519, %function -curve25519: - push {r4, r5, r6, r7, r8, r9, r10, r11, lr} - sub sp, sp, #0xbc - str r0, [sp, #160] - str r1, [sp, #164] - str r2, [sp, #168] - mov r1, #0 - str r1, [sp, #172] - # Set one - mov r10, #1 - mov r11, #0 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0] - str r11, [r0, #4] + ldr r6, [sp, #32] + ldr r7, [sp, #36] #else - strd r10, r11, [r0] + ldrd r6, r7, 
[sp, #32] #endif - mov r10, #0 + eor r8, r4, r6 + eor r9, r5, r7 + and r8, r8, r1 + and r9, r9, r1 + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r8 + eor r7, r7, r9 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #8] - str r11, [r0, #12] -#else - strd r10, r11, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #16] - str r11, [r0, #20] -#else - strd r10, r11, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #24] - str r11, [r0, #28] -#else - strd r10, r11, [r0, #24] -#endif - # Set zero - mov r10, #0 - mov r11, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp] - str r11, [sp, #4] -#else - strd r10, r11, [sp] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #8] - str r11, [sp, #12] -#else - strd r10, r11, [sp, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #16] - str r11, [sp, #20] -#else - strd r10, r11, [sp, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #24] - str r11, [sp, #28] -#else - strd r10, r11, [sp, #24] -#endif - # Set one - mov r10, #1 - mov r11, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #32] - str r11, [sp, #36] -#else - strd r10, r11, [sp, #32] -#endif - mov r10, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #40] - str r11, [sp, #44] -#else - strd r10, r11, [sp, #40] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #48] - str r11, [sp, #52] -#else - strd r10, r11, [sp, #48] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #56] - str r11, [sp, #60] -#else - strd r10, r11, [sp, #56] -#endif - # Copy -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2] - ldr r5, [r2, #4] -#else - ldrd r4, r5, [r2] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r2, #8] - ldr r7, [r2, #12] -#else - ldrd r6, r7, [r2, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp, #64] - str r5, [sp, #68] -#else - strd r4, r5, [sp, #64] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [sp, #72] - str r7, [sp, #76] -#else - strd r6, r7, [sp, #72] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2, #16] - ldr r5, [r2, #20] -#else - ldrd r4, r5, [r2, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r2, #24] - ldr r7, [r2, #28] -#else - ldrd r6, r7, [r2, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp, #80] - str r5, [sp, #84] -#else - strd r4, r5, [sp, #80] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [sp, #88] - str r7, [sp, #92] -#else - strd r6, r7, [sp, #88] -#endif - mov r1, #30 - str r1, [sp, #180] - mov r2, #28 - str r2, [sp, #176] -L_curve25519_words: -L_curve25519_bits: - ldr r1, [sp, #164] - ldr r2, [r1, r2] - ldr r1, [sp, #180] - lsr r2, r2, r1 - and r2, r2, #1 - str r2, [sp, #184] - ldr r1, [sp, #172] - eor r1, r1, r2 - str r1, [sp, #172] - ldr r0, [sp, #160] - # Conditional Swap - neg r1, r1 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, 
#64] - ldr r7, [sp, #68] -#else - ldrd r6, r7, [sp, #64] -#endif - eor r8, r4, r6 - eor r9, r5, r7 - and r8, r8, r1 - and r9, r9, r1 - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r8 - eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] + str r4, [sp] + str r5, [sp, #4] #else - strd r4, r5, [r0] + strd r4, r5, [sp] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [sp, #64] - str r7, [sp, #68] + str r6, [sp, #32] + str r7, [sp, #36] #else - strd r6, r7, [sp, #64] + strd r6, r7, [sp, #32] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #8] - ldr r5, [r0, #12] + ldr r4, [sp, #8] + ldr r5, [sp, #12] #else - ldrd r4, r5, [r0, #8] + ldrd r4, r5, [sp, #8] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #72] - ldr r7, [sp, #76] + ldr r6, [sp, #40] + ldr r7, [sp, #44] #else - ldrd r6, r7, [sp, #72] + ldrd r6, r7, [sp, #40] #endif eor r8, r4, r6 eor r9, r5, r7 @@ -4385,28 +3014,28 @@ L_curve25519_bits: eor r6, r6, r8 eor r7, r7, r9 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #8] - str r5, [r0, #12] + str r4, [sp, #8] + str r5, [sp, #12] #else - strd r4, r5, [r0, #8] + strd r4, r5, [sp, #8] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [sp, #72] - str r7, [sp, #76] + str r6, [sp, #40] + str r7, [sp, #44] #else - strd r6, r7, [sp, #72] + strd r6, r7, [sp, #40] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #16] - ldr r5, [r0, #20] + ldr r4, [sp, #16] + ldr r5, [sp, #20] #else - ldrd r4, r5, [r0, #16] + ldrd r4, r5, [sp, #16] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #80] - ldr r7, [sp, #84] + ldr r6, [sp, #48] + ldr r7, [sp, #52] #else - ldrd r6, r7, [sp, #80] + ldrd r6, r7, [sp, #48] #endif eor r8, r4, r6 eor r9, r5, r7 @@ -4417,28 +3046,28 @@ L_curve25519_bits: eor r6, r6, r8 eor r7, r7, r9 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #16] - str r5, [r0, #20] + str r4, [sp, #16] + str r5, [sp, #20] #else - strd r4, r5, [r0, #16] + strd r4, r5, [sp, #16] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [sp, #80] - str r7, [sp, #84] + str r6, [sp, #48] + str r7, [sp, #52] #else - strd r6, r7, [sp, #80] + strd r6, r7, [sp, #48] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #24] - ldr r5, [r0, #28] + ldr r4, [sp, #24] + ldr r5, [sp, #28] #else - ldrd r4, r5, [r0, #24] + ldrd r4, r5, [sp, #24] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #88] - ldr r7, [sp, #92] + ldr r6, [sp, #56] + ldr r7, [sp, #60] #else - ldrd r6, r7, [sp, #88] + ldrd r6, r7, [sp, #56] #endif eor r8, r4, r6 eor r9, r5, r7 @@ -4449,4997 +3078,1228 @@ L_curve25519_bits: eor r6, r6, r8 eor r7, r7, r9 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #24] - str r5, [r0, #28] -#else - strd r4, r5, [r0, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [sp, #88] - str r7, [sp, #92] -#else - strd r6, r7, [sp, #88] -#endif - ldr r1, [sp, #172] - # Conditional Swap - neg r1, r1 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp] - ldr r5, [sp, #4] + str r4, [sp, #24] + str r5, [sp, #28] #else - ldrd r4, r5, [sp] + strd r4, r5, [sp, #24] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, 
[sp, #32] - ldr r7, [sp, #36] + str r6, [sp, #56] + str r7, [sp, #60] #else - ldrd r6, r7, [sp, #32] -#endif - eor r8, r4, r6 - eor r9, r5, r7 - and r8, r8, r1 - and r9, r9, r1 - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r8 - eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp] - str r5, [sp, #4] -#else - strd r4, r5, [sp] + strd r6, r7, [sp, #56] #endif + ldr r1, [sp, #184] + str r1, [sp, #172] + mov r3, sp + ldr r2, [sp, #160] + add r1, sp, #0x80 + ldr r0, [sp, #160] + bl fe_add_sub_op + add r3, sp, #32 + add r2, sp, #0x40 + add r1, sp, #0x60 + mov r0, sp + bl fe_add_sub_op + ldr r2, [sp, #160] + add r1, sp, #0x60 + add r0, sp, #32 + bl fe_mul_op + add r2, sp, #0x80 + mov r1, sp + mov r0, sp + bl fe_mul_op + add r1, sp, #0x80 + add r0, sp, #0x80 + bl fe_sq_op + ldr r1, [sp, #160] + add r0, sp, #0x60 + bl fe_sq_op + mov r3, sp + add r2, sp, #32 + mov r1, sp + add r0, sp, #0x40 + bl fe_add_sub_op + add r2, sp, #0x80 + add r1, sp, #0x60 + ldr r0, [sp, #160] + bl fe_mul_op + add r2, sp, #0x80 + add r1, sp, #0x60 + add r0, sp, #0x60 + bl fe_sub_op + mov r1, sp + mov r0, sp + bl fe_sq_op + add r1, sp, #0x60 + add r0, sp, #32 + bl fe_mul121666 + add r1, sp, #0x40 + add r0, sp, #0x40 + bl fe_sq_op + add r2, sp, #32 + add r1, sp, #0x80 + add r0, sp, #0x80 + bl fe_add_op + mov r2, sp + ldr r1, [sp, #168] + add r0, sp, #32 + bl fe_mul_op + add r2, sp, #0x80 + add r1, sp, #0x60 + mov r0, sp + bl fe_mul_op + ldr r2, [sp, #176] + ldr r1, [sp, #180] + subs r1, r1, #1 + str r1, [sp, #180] + bge L_curve25519_bits + mov r1, #31 + str r1, [sp, #180] + subs r2, r2, #4 + str r2, [sp, #176] + bge L_curve25519_words + # Invert + add r1, sp, #0 + add r0, sp, #32 + bl fe_sq_op + add r1, sp, #32 + add r0, sp, #0x40 + bl fe_sq_op + add r1, sp, #0x40 + add r0, sp, #0x40 + bl fe_sq_op + add r2, sp, #0x40 + add r1, sp, #0 + add r0, sp, #0x40 + bl fe_mul_op + add r2, sp, #0x40 + add r1, sp, #32 + add r0, sp, #32 + bl fe_mul_op + add r1, sp, #32 + add r0, sp, #0x60 + bl fe_sq_op + add r2, sp, #0x60 + add r1, sp, #0x40 + add r0, sp, #0x40 + bl fe_mul_op + add r1, sp, #0x40 + add r0, sp, #0x60 + bl fe_sq_op + mov r12, #4 +L_curve25519_inv_1: + add r1, sp, #0x60 + add r0, sp, #0x60 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_curve25519_inv_1 + add r2, sp, #0x40 + add r1, sp, #0x60 + add r0, sp, #0x40 + bl fe_mul_op + add r1, sp, #0x40 + add r0, sp, #0x60 + bl fe_sq_op + mov r12, #9 +L_curve25519_inv_2: + add r1, sp, #0x60 + add r0, sp, #0x60 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_curve25519_inv_2 + add r2, sp, #0x40 + add r1, sp, #0x60 + add r0, sp, #0x60 + bl fe_mul_op + add r1, sp, #0x60 + add r0, sp, #0x80 + bl fe_sq_op + mov r12, #19 +L_curve25519_inv_3: + add r1, sp, #0x80 + add r0, sp, #0x80 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_curve25519_inv_3 + add r2, sp, #0x60 + add r1, sp, #0x80 + add r0, sp, #0x60 + bl fe_mul_op + mov r12, #10 +L_curve25519_inv_4: + add r1, sp, #0x60 + add r0, sp, #0x60 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_curve25519_inv_4 + add r2, sp, #0x40 + add r1, sp, #0x60 + add r0, sp, #0x40 + bl fe_mul_op + add r1, sp, #0x40 + add r0, sp, #0x60 + bl fe_sq_op + mov r12, #49 +L_curve25519_inv_5: + add r1, sp, #0x60 + add r0, sp, #0x60 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_curve25519_inv_5 + add r2, sp, #0x40 + add r1, sp, #0x60 + add r0, sp, #0x60 + bl fe_mul_op + add r1, sp, #0x60 + add r0, sp, #0x80 + bl fe_sq_op + mov 
r12, #0x63 +L_curve25519_inv_6: + add r1, sp, #0x80 + add r0, sp, #0x80 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_curve25519_inv_6 + add r2, sp, #0x60 + add r1, sp, #0x80 + add r0, sp, #0x60 + bl fe_mul_op + mov r12, #50 +L_curve25519_inv_7: + add r1, sp, #0x60 + add r0, sp, #0x60 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_curve25519_inv_7 + add r2, sp, #0x40 + add r1, sp, #0x60 + add r0, sp, #0x40 + bl fe_mul_op + mov r12, #5 +L_curve25519_inv_8: + add r1, sp, #0x40 + add r0, sp, #0x40 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_curve25519_inv_8 + add r2, sp, #32 + add r1, sp, #0x40 + add r0, sp, #0 + bl fe_mul_op + mov r2, sp + ldr r1, [sp, #160] + ldr r0, [sp, #160] + bl fe_mul_op + mov r0, #0 + add sp, sp, #0xbc + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size curve25519,.-curve25519 +#else + .text + .align 4 + .globl curve25519 + .type curve25519, %function +curve25519: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #0xc0 + str r0, [sp, #176] + str r1, [sp, #160] + str r2, [sp, #172] + add r5, sp, #0x40 + add r4, sp, #32 + str sp, [sp, #184] + str r5, [sp, #180] + str r4, [sp, #188] + mov r1, #0 + str r1, [sp, #164] + # Set one + mov r10, #1 + mov r11, #0 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [sp, #32] - str r7, [sp, #36] + str r10, [r0] + str r11, [r0, #4] #else - strd r6, r7, [sp, #32] + strd r10, r11, [r0] #endif + mov r10, #0 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #8] - ldr r5, [sp, #12] + str r10, [r0, #8] + str r11, [r0, #12] #else - ldrd r4, r5, [sp, #8] + strd r10, r11, [r0, #8] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #40] - ldr r7, [sp, #44] + str r10, [r0, #16] + str r11, [r0, #20] #else - ldrd r6, r7, [sp, #40] + strd r10, r11, [r0, #16] #endif - eor r8, r4, r6 - eor r9, r5, r7 - and r8, r8, r1 - and r9, r9, r1 - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r8 - eor r7, r7, r9 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp, #8] - str r5, [sp, #12] + str r10, [r0, #24] + str r11, [r0, #28] #else - strd r4, r5, [sp, #8] + strd r10, r11, [r0, #24] #endif + # Set zero + mov r10, #0 + mov r11, #0 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [sp, #40] - str r7, [sp, #44] + str r10, [sp] + str r11, [sp, #4] #else - strd r6, r7, [sp, #40] + strd r10, r11, [sp] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #16] - ldr r5, [sp, #20] + str r10, [sp, #8] + str r11, [sp, #12] #else - ldrd r4, r5, [sp, #16] + strd r10, r11, [sp, #8] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #48] - ldr r7, [sp, #52] + str r10, [sp, #16] + str r11, [sp, #20] #else - ldrd r6, r7, [sp, #48] + strd r10, r11, [sp, #16] #endif - eor r8, r4, r6 - eor r9, r5, r7 - and r8, r8, r1 - and r9, r9, r1 - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r8 - eor r7, r7, r9 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp, #16] - str r5, [sp, #20] + str r10, [sp, #24] + str r11, [sp, #28] #else - strd r4, r5, [sp, #16] + strd r10, r11, [sp, #24] #endif + # Set one + mov r10, #1 + mov r11, #0 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [sp, #48] - str r7, [sp, #52] + str r10, [sp, #32] + str r11, [sp, #36] #else - strd r6, r7, [sp, #48] + strd r10, r11, [sp, #32] #endif + mov r10, #0 #if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #24] - ldr r5, [sp, #28] + str r10, [sp, #40] + str r11, [sp, #44] #else - ldrd r4, r5, [sp, #24] + strd r10, r11, [sp, #40] #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #56] - ldr r7, [sp, #60] + str r10, [sp, #48] + str r11, [sp, #52] #else - ldrd r6, r7, [sp, #56] + strd r10, r11, [sp, #48] #endif - eor r8, r4, r6 - eor r9, r5, r7 - and r8, r8, r1 - and r9, r9, r1 - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r8 - eor r7, r7, r9 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp, #24] - str r5, [sp, #28] -#else - strd r4, r5, [sp, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [sp, #56] - str r7, [sp, #60] + str r10, [sp, #56] + str r11, [sp, #60] #else - strd r6, r7, [sp, #56] + strd r10, r11, [sp, #56] #endif + add r3, sp, #0x40 + # Copy + ldm r2, {r4, r5, r6, r7, r8, r9, r10, r11} + stm r3, {r4, r5, r6, r7, r8, r9, r10, r11} + mov r2, #0xfe +L_curve25519_bits: + str r2, [sp, #168] + ldr r1, [sp, #160] + and r4, r2, #31 + lsr r2, r2, #5 + ldr r2, [r1, r2, lsl #2] + rsb r4, r4, #31 + lsl r2, r2, r4 + ldr r1, [sp, #164] + eor r1, r1, r2 + asr r1, r1, #31 + str r2, [sp, #164] + # Conditional Swap + add r11, sp, #0xb0 + ldm r11, {r4, r5, r6, r7} + eor r8, r4, r5 + eor r9, r6, r7 + and r8, r8, r1 + and r9, r9, r1 + eor r4, r4, r8 + eor r5, r5, r8 + eor r6, r6, r9 + eor r7, r7, r9 + stm r11, {r4, r5, r6, r7} + # Ladder step + ldr r3, [sp, #184] + ldr r2, [sp, #176] + add r1, sp, #0x80 + ldr r0, [sp, #176] + bl fe_add_sub_op + ldr r3, [sp, #188] + ldr r2, [sp, #180] + add r1, sp, #0x60 + ldr r0, [sp, #184] + bl fe_add_sub_op + ldr r2, [sp, #176] + add r1, sp, #0x60 + ldr r0, [sp, #188] + bl fe_mul_op + add r2, sp, #0x80 ldr r1, [sp, #184] - str r1, [sp, #172] - # Add-Sub - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp] - ldr r7, [sp, #4] -#else - ldrd r6, r7, [sp] -#endif - adds r8, r4, r6 - mov r3, #0 - adcs r9, r5, r7 - adc r3, r3, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] -#else - strd r8, r9, [r0] -#endif - # Sub - subs r10, r4, r6 - mov r12, #0 - sbcs r11, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #128] - str r11, [sp, #132] -#else - strd r10, r11, [sp, #128] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #8] - ldr r5, [r0, #12] -#else - ldrd r4, r5, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #8] - ldr r7, [sp, #12] -#else - ldrd r6, r7, [sp, #8] -#endif - adds r3, r3, #-1 - adcs r8, r4, r6 - mov r3, #0 - adcs r9, r5, r7 - adc r3, r3, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #8] - str r9, [r0, #12] -#else - strd r8, r9, [r0, #8] -#endif - # Sub - adds r12, r12, #-1 - sbcs r10, r4, r6 - mov r12, #0 - sbcs r11, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #136] - str r11, [sp, #140] -#else - strd r10, r11, [sp, #136] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #16] - ldr r5, [r0, #20] -#else - ldrd r4, r5, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #16] - ldr r7, 
[sp, #20] -#else - ldrd r6, r7, [sp, #16] -#endif - adds r3, r3, #-1 - adcs r8, r4, r6 - mov r3, #0 - adcs r9, r5, r7 - adc r3, r3, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif - # Sub - adds r12, r12, #-1 - sbcs r10, r4, r6 - mov r12, #0 - sbcs r11, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #144] - str r11, [sp, #148] -#else - strd r10, r11, [sp, #144] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #24] - ldr r5, [r0, #28] -#else - ldrd r4, r5, [r0, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #24] - ldr r7, [sp, #28] -#else - ldrd r6, r7, [sp, #24] -#endif - adds r3, r3, #-1 - adcs r8, r4, r6 - adc r9, r5, r7 - # Sub - adds r12, r12, #-1 - sbcs r10, r4, r6 - sbc r11, r5, r7 - mov r3, #-19 - asr r2, r9, #31 - # Mask the modulus - and r3, r2, r3 - and r12, r2, #0x7fffffff - # Sub modulus (if overflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif - subs r4, r4, r3 - sbcs r5, r5, r2 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #8] - ldr r5, [r0, #12] -#else - ldrd r4, r5, [r0, #8] -#endif - sbcs r4, r4, r2 - sbcs r5, r5, r2 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #8] - str r5, [r0, #12] -#else - strd r4, r5, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #16] - ldr r5, [r0, #20] -#else - ldrd r4, r5, [r0, #16] -#endif - sbcs r4, r4, r2 - sbcs r5, r5, r2 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #16] - str r5, [r0, #20] -#else - strd r4, r5, [r0, #16] -#endif - sbcs r8, r8, r2 - sbc r9, r9, r12 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #24] - str r9, [r0, #28] -#else - strd r8, r9, [r0, #24] -#endif - mov r3, #-19 - asr r2, r11, #31 - # Mask the modulus - and r3, r2, r3 - and r12, r2, #0x7fffffff - # Add modulus (if underflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #128] - ldr r5, [sp, #132] -#else - ldrd r4, r5, [sp, #128] -#endif - adds r4, r4, r3 - adcs r5, r5, r2 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp, #128] - str r5, [sp, #132] -#else - strd r4, r5, [sp, #128] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #136] - ldr r5, [sp, #140] -#else - ldrd r4, r5, [sp, #136] -#endif - adcs r4, r4, r2 - adcs r5, r5, r2 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp, #136] - str r5, [sp, #140] -#else - strd r4, r5, [sp, #136] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #144] - ldr r5, [sp, #148] -#else - ldrd r4, r5, [sp, #144] -#endif - adcs r4, r4, r2 - adcs r5, r5, r2 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp, #144] - str r5, [sp, #148] -#else - strd r4, r5, [sp, #144] -#endif - adcs r10, r10, r2 - adc r11, r11, r12 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #152] - str r11, [sp, #156] -#else - strd r10, r11, [sp, #152] -#endif - # Add-Sub - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #64] - ldr r5, [sp, #68] -#else - ldrd r4, r5, [sp, #64] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #32] - ldr r7, [sp, #36] -#else - ldrd r6, r7, [sp, #32] -#endif - adds r8, r4, r6 - mov r3, #0 - adcs r9, r5, r7 - adc r3, r3, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [sp] - str r9, [sp, #4] -#else - strd r8, r9, [sp] -#endif - # Sub - subs r10, r4, r6 - mov r12, #0 - sbcs r11, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #96] - str r11, [sp, #100] -#else - strd r10, r11, [sp, #96] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #72] - ldr r5, [sp, #76] -#else - ldrd r4, r5, [sp, #72] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #40] - ldr r7, [sp, #44] -#else - ldrd r6, r7, [sp, #40] -#endif - adds r3, r3, #-1 - adcs r8, r4, r6 - mov r3, #0 - adcs r9, r5, r7 - adc r3, r3, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [sp, #8] - str r9, [sp, #12] -#else - strd r8, r9, [sp, #8] -#endif - # Sub - adds r12, r12, #-1 - sbcs r10, r4, r6 - mov r12, #0 - sbcs r11, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #104] - str r11, [sp, #108] -#else - strd r10, r11, [sp, #104] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #80] - ldr r5, [sp, #84] -#else - ldrd r4, r5, [sp, #80] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #48] - ldr r7, [sp, #52] -#else - ldrd r6, r7, [sp, #48] -#endif - adds r3, r3, #-1 - adcs r8, r4, r6 - mov r3, #0 - adcs r9, r5, r7 - adc r3, r3, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [sp, #16] - str r9, [sp, #20] -#else - strd r8, r9, [sp, #16] -#endif - # Sub - adds r12, r12, #-1 - sbcs r10, r4, r6 - mov r12, #0 - sbcs r11, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #112] - str r11, [sp, #116] -#else - strd r10, r11, [sp, #112] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #88] - ldr r5, [sp, #92] -#else - ldrd r4, r5, [sp, #88] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #56] - ldr r7, [sp, #60] -#else - ldrd r6, r7, [sp, #56] -#endif - adds r3, r3, #-1 - adcs r8, r4, r6 - adc r9, r5, r7 - # Sub - adds r12, r12, #-1 - sbcs r10, r4, r6 - sbc r11, r5, r7 - mov r3, #-19 - asr r2, r9, #31 - # Mask the modulus - and r3, r2, r3 - and r12, r2, #0x7fffffff - # Sub modulus (if overflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp] - ldr r5, [sp, #4] -#else - ldrd r4, r5, [sp] -#endif - subs r4, r4, r3 - sbcs r5, r5, r2 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp] - str r5, [sp, #4] -#else - strd r4, r5, [sp] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #8] - ldr r5, [sp, #12] -#else - ldrd r4, r5, [sp, #8] -#endif - sbcs r4, r4, r2 - sbcs r5, r5, r2 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp, #8] - str r5, [sp, #12] -#else - strd r4, r5, [sp, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #16] - ldr r5, [sp, #20] -#else - ldrd r4, r5, [sp, #16] -#endif - sbcs r4, r4, r2 - sbcs r5, r5, r2 
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp, #16] - str r5, [sp, #20] -#else - strd r4, r5, [sp, #16] -#endif - sbcs r8, r8, r2 - sbc r9, r9, r12 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [sp, #24] - str r9, [sp, #28] -#else - strd r8, r9, [sp, #24] -#endif - mov r3, #-19 - asr r2, r11, #31 - # Mask the modulus - and r3, r2, r3 - and r12, r2, #0x7fffffff - # Add modulus (if underflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #96] - ldr r5, [sp, #100] -#else - ldrd r4, r5, [sp, #96] -#endif - adds r4, r4, r3 - adcs r5, r5, r2 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp, #96] - str r5, [sp, #100] -#else - strd r4, r5, [sp, #96] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #104] - ldr r5, [sp, #108] -#else - ldrd r4, r5, [sp, #104] -#endif - adcs r4, r4, r2 - adcs r5, r5, r2 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp, #104] - str r5, [sp, #108] -#else - strd r4, r5, [sp, #104] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #112] - ldr r5, [sp, #116] -#else - ldrd r4, r5, [sp, #112] -#endif - adcs r4, r4, r2 - adcs r5, r5, r2 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp, #112] - str r5, [sp, #116] -#else - strd r4, r5, [sp, #112] -#endif - adcs r10, r10, r2 - adc r11, r11, r12 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #120] - str r11, [sp, #124] -#else - strd r10, r11, [sp, #120] -#endif - ldr r2, [sp, #160] + ldr r0, [sp, #184] + bl fe_mul_op + add r1, sp, #0x80 + add r0, sp, #0x60 + bl fe_sq_op + ldr r1, [sp, #176] + add r0, sp, #0x80 + bl fe_sq_op + ldr r3, [sp, #184] + ldr r2, [sp, #188] + ldr r1, [sp, #184] + ldr r0, [sp, #180] + bl fe_add_sub_op + add r2, sp, #0x60 + add r1, sp, #0x80 + ldr r0, [sp, #176] + bl fe_mul_op + add r2, sp, #0x60 + add r1, sp, #0x80 + add r0, sp, #0x80 + bl fe_sub_op + ldr r1, [sp, #184] + ldr r0, [sp, #184] + bl fe_sq_op + add r1, sp, #0x80 + ldr r0, [sp, #188] + bl fe_mul121666 + ldr r1, [sp, #180] + ldr r0, [sp, #180] + bl fe_sq_op + ldr r2, [sp, #188] add r1, sp, #0x60 + add r0, sp, #0x60 + bl fe_add_op + ldr r2, [sp, #184] + ldr r1, [sp, #172] + ldr r0, [sp, #188] + bl fe_mul_op + add r2, sp, #0x60 + add r1, sp, #0x80 + ldr r0, [sp, #184] + bl fe_mul_op + ldr r2, [sp, #168] + subs r2, r2, #1 + bge L_curve25519_bits + ldr r1, [sp, #184] + # Copy + ldm r1, {r4, r5, r6, r7, r8, r9, r10, r11} + stm sp, {r4, r5, r6, r7, r8, r9, r10, r11} + # Invert + add r1, sp, #0 add r0, sp, #32 - bl fe_mul - add r2, sp, #0x80 + bl fe_sq_op + add r1, sp, #32 + add r0, sp, #0x40 + bl fe_sq_op + add r1, sp, #0x40 + add r0, sp, #0x40 + bl fe_sq_op + add r2, sp, #0x40 add r1, sp, #0 - add r0, sp, #0 - bl fe_mul + add r0, sp, #0x40 + bl fe_mul_op + add r2, sp, #0x40 + add r1, sp, #32 + add r0, sp, #32 + bl fe_mul_op + add r1, sp, #32 + add r0, sp, #0x60 + bl fe_sq_op + add r2, sp, #0x60 + add r1, sp, #0x40 + add r0, sp, #0x40 + bl fe_mul_op + add r1, sp, #0x40 + add r0, sp, #0x60 + bl fe_sq_op + mov r12, #4 +L_curve25519_inv_1: + add r1, sp, #0x60 + add r0, sp, #0x60 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_curve25519_inv_1 + add r2, sp, #0x40 + add r1, sp, #0x60 + add r0, sp, #0x40 + bl fe_mul_op + add r1, sp, #0x40 + add r0, sp, #0x60 + bl fe_sq_op + mov r12, #9 +L_curve25519_inv_2: + add r1, sp, #0x60 + add r0, sp, #0x60 + push {r12} + bl 
fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_curve25519_inv_2 + add r2, sp, #0x40 + add r1, sp, #0x60 + add r0, sp, #0x60 + bl fe_mul_op + add r1, sp, #0x60 + add r0, sp, #0x80 + bl fe_sq_op + mov r12, #19 +L_curve25519_inv_3: + add r1, sp, #0x80 + add r0, sp, #0x80 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_curve25519_inv_3 + add r2, sp, #0x60 add r1, sp, #0x80 add r0, sp, #0x60 - bl fe_sq - ldr r1, [sp, #160] + bl fe_mul_op + mov r12, #10 +L_curve25519_inv_4: + add r1, sp, #0x60 + add r0, sp, #0x60 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_curve25519_inv_4 + add r2, sp, #0x40 + add r1, sp, #0x60 + add r0, sp, #0x40 + bl fe_mul_op + add r1, sp, #0x40 + add r0, sp, #0x60 + bl fe_sq_op + mov r12, #49 +L_curve25519_inv_5: + add r1, sp, #0x60 + add r0, sp, #0x60 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_curve25519_inv_5 + add r2, sp, #0x40 + add r1, sp, #0x60 + add r0, sp, #0x60 + bl fe_mul_op + add r1, sp, #0x60 add r0, sp, #0x80 - bl fe_sq - # Add-Sub - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #32] - ldr r5, [sp, #36] -#else - ldrd r4, r5, [sp, #32] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp] - ldr r7, [sp, #4] -#else - ldrd r6, r7, [sp] -#endif - adds r8, r4, r6 - mov r3, #0 - adcs r9, r5, r7 - adc r3, r3, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [sp, #64] - str r9, [sp, #68] -#else - strd r8, r9, [sp, #64] -#endif - # Sub - subs r10, r4, r6 - mov r12, #0 - sbcs r11, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp] - str r11, [sp, #4] -#else - strd r10, r11, [sp] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #40] - ldr r5, [sp, #44] -#else - ldrd r4, r5, [sp, #40] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #8] - ldr r7, [sp, #12] -#else - ldrd r6, r7, [sp, #8] -#endif - adds r3, r3, #-1 - adcs r8, r4, r6 - mov r3, #0 - adcs r9, r5, r7 - adc r3, r3, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [sp, #72] - str r9, [sp, #76] -#else - strd r8, r9, [sp, #72] -#endif - # Sub - adds r12, r12, #-1 - sbcs r10, r4, r6 - mov r12, #0 - sbcs r11, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #8] - str r11, [sp, #12] -#else - strd r10, r11, [sp, #8] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #48] - ldr r5, [sp, #52] -#else - ldrd r4, r5, [sp, #48] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #16] - ldr r7, [sp, #20] -#else - ldrd r6, r7, [sp, #16] -#endif - adds r3, r3, #-1 - adcs r8, r4, r6 - mov r3, #0 - adcs r9, r5, r7 - adc r3, r3, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [sp, #80] - str r9, [sp, #84] -#else - strd r8, r9, [sp, #80] -#endif - # Sub - adds r12, r12, #-1 - sbcs r10, r4, r6 - mov r12, #0 - sbcs r11, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #16] - str r11, [sp, #20] -#else - strd r10, r11, [sp, #16] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #56] - ldr r5, [sp, #60] -#else - ldrd r4, r5, [sp, #56] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #24] - ldr r7, [sp, #28] -#else - ldrd 
r6, r7, [sp, #24] -#endif - adds r3, r3, #-1 - adcs r8, r4, r6 - adc r9, r5, r7 - # Sub - adds r12, r12, #-1 - sbcs r10, r4, r6 - sbc r11, r5, r7 - mov r3, #-19 - asr r2, r9, #31 - # Mask the modulus - and r3, r2, r3 - and r12, r2, #0x7fffffff - # Sub modulus (if overflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #64] - ldr r5, [sp, #68] -#else - ldrd r4, r5, [sp, #64] -#endif - subs r4, r4, r3 - sbcs r5, r5, r2 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp, #64] - str r5, [sp, #68] -#else - strd r4, r5, [sp, #64] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #72] - ldr r5, [sp, #76] -#else - ldrd r4, r5, [sp, #72] -#endif - sbcs r4, r4, r2 - sbcs r5, r5, r2 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp, #72] - str r5, [sp, #76] -#else - strd r4, r5, [sp, #72] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #80] - ldr r5, [sp, #84] -#else - ldrd r4, r5, [sp, #80] -#endif - sbcs r4, r4, r2 - sbcs r5, r5, r2 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp, #80] - str r5, [sp, #84] -#else - strd r4, r5, [sp, #80] -#endif - sbcs r8, r8, r2 - sbc r9, r9, r12 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [sp, #88] - str r9, [sp, #92] -#else - strd r8, r9, [sp, #88] -#endif - mov r3, #-19 - asr r2, r11, #31 - # Mask the modulus - and r3, r2, r3 - and r12, r2, #0x7fffffff - # Add modulus (if underflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp] - ldr r5, [sp, #4] -#else - ldrd r4, r5, [sp] -#endif - adds r4, r4, r3 - adcs r5, r5, r2 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp] - str r5, [sp, #4] -#else - strd r4, r5, [sp] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #8] - ldr r5, [sp, #12] -#else - ldrd r4, r5, [sp, #8] -#endif - adcs r4, r4, r2 - adcs r5, r5, r2 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp, #8] - str r5, [sp, #12] -#else - strd r4, r5, [sp, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #16] - ldr r5, [sp, #20] -#else - ldrd r4, r5, [sp, #16] -#endif - adcs r4, r4, r2 - adcs r5, r5, r2 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp, #16] - str r5, [sp, #20] -#else - strd r4, r5, [sp, #16] -#endif - adcs r10, r10, r2 - adc r11, r11, r12 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #24] - str r11, [sp, #28] -#else - strd r10, r11, [sp, #24] -#endif + bl fe_sq_op + mov r12, #0x63 +L_curve25519_inv_6: + add r1, sp, #0x80 + add r0, sp, #0x80 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_curve25519_inv_6 add r2, sp, #0x60 add r1, sp, #0x80 - ldr r0, [sp, #160] - bl fe_mul - # Sub -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #128] - ldr r5, [sp, #132] -#else - ldrd r4, r5, [sp, #128] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #136] - ldr r7, [sp, #140] -#else - ldrd r6, r7, [sp, #136] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [sp, #96] - ldr r9, [sp, #100] -#else - ldrd r8, r9, [sp, #96] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [sp, #104] - ldr r11, [sp, #108] -#else - ldrd r10, r11, [sp, #104] -#endif - subs r8, r4, r8 - sbcs r9, r5, r9 - 
sbcs r10, r6, r10 - sbcs r11, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [sp, #128] - str r9, [sp, #132] -#else - strd r8, r9, [sp, #128] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #136] - str r11, [sp, #140] -#else - strd r10, r11, [sp, #136] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #144] - ldr r5, [sp, #148] -#else - ldrd r4, r5, [sp, #144] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #152] - ldr r7, [sp, #156] -#else - ldrd r6, r7, [sp, #152] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [sp, #112] - ldr r9, [sp, #116] -#else - ldrd r8, r9, [sp, #112] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [sp, #120] - ldr r11, [sp, #124] -#else - ldrd r10, r11, [sp, #120] -#endif - sbcs r8, r4, r8 - sbcs r9, r5, r9 - sbcs r10, r6, r10 - sbc r11, r7, r11 - mov r3, #-19 - asr r2, r11, #31 - # Mask the modulus - and r3, r2, r3 - and r12, r2, #0x7fffffff - # Add modulus (if underflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #128] - ldr r5, [sp, #132] -#else - ldrd r4, r5, [sp, #128] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #136] - ldr r7, [sp, #140] -#else - ldrd r6, r7, [sp, #136] -#endif - adds r4, r4, r3 - adcs r5, r5, r2 - adcs r6, r6, r2 - adcs r7, r7, r2 - adcs r8, r8, r2 - adcs r9, r9, r2 - adcs r10, r10, r2 - adc r11, r11, r12 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp, #128] - str r5, [sp, #132] -#else - strd r4, r5, [sp, #128] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [sp, #136] - str r7, [sp, #140] -#else - strd r6, r7, [sp, #136] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [sp, #144] - str r9, [sp, #148] -#else - strd r8, r9, [sp, #144] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #152] - str r11, [sp, #156] -#else - strd r10, r11, [sp, #152] -#endif - add r1, sp, #0 + add r0, sp, #0x60 + bl fe_mul_op + mov r12, #50 +L_curve25519_inv_7: + add r1, sp, #0x60 + add r0, sp, #0x60 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_curve25519_inv_7 + add r2, sp, #0x40 + add r1, sp, #0x60 + add r0, sp, #0x40 + bl fe_mul_op + mov r12, #5 +L_curve25519_inv_8: + add r1, sp, #0x40 + add r0, sp, #0x40 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_curve25519_inv_8 + add r2, sp, #32 + add r1, sp, #0x40 add r0, sp, #0 - bl fe_sq - # Multiply by 121666 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #128] - ldr r5, [sp, #132] -#else - ldrd r4, r5, [sp, #128] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #136] - ldr r7, [sp, #140] -#else - ldrd r6, r7, [sp, #136] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [sp, #144] - ldr r9, [sp, #148] -#else - ldrd r8, r9, [sp, #144] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [sp, #152] - ldr r11, [sp, #156] -#else - ldrd r10, r11, [sp, #152] -#endif - movw r12, #0xdb42 - movt r12, #1 - umull r4, r2, r4, r12 - umull r5, r3, r5, r12 - adds r5, r5, r2 - adc r2, r3, #0 - umull r6, r3, r6, r12 - adds r6, r6, r2 - adc r2, r3, #0 - umull r7, r3, r7, r12 - adds r7, r7, r2 - adc r2, r3, #0 - umull r8, r3, r8, r12 - adds r8, r8, r2 
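Note: the inline `# Multiply by 121666` sequence being removed here (the added code calls the shared fe_mul121666 routine instead) multiplies a field element by the Curve25519 constant 121666 = (A + 2)/4 and folds the overflow back in one pass, using the fact that 2^255 is congruent to 19 modulo p = 2^255 - 19. A hedged C sketch of that operation; the limb layout and function name are illustrative, not the library's:

    #include <stdint.h>

    static void fe_mul121666_sketch(uint32_t r[8], const uint32_t a[8])
    {
        uint64_t c = 0;
        int i;
        /* Schoolbook multiply of eight 32-bit limbs by the small constant. */
        for (i = 0; i < 8; i++) {
            c += (uint64_t)a[i] * 121666U;
            r[i] = (uint32_t)c;
            c >>= 32;
        }
        /* Fold bits 255 and up: value = low255 + top*2^255 = low255 + 19*top (mod p). */
        uint64_t top = (c << 1) | (r[7] >> 31);
        r[7] &= 0x7fffffffU;
        c = top * 19U;
        for (i = 0; i < 8 && c != 0; i++) {
            c += r[i];
            r[i] = (uint32_t)c;
            c >>= 32;
        }
    }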
- adc r2, r3, #0 - umull r9, r3, r9, r12 - adds r9, r9, r2 - adc r2, r3, #0 - umull r10, r3, r10, r12 - adds r10, r10, r2 - adc r2, r3, #0 - umull r11, r3, r11, r12 - adds r11, r11, r2 - adc r2, r3, #0 - mov r12, #19 - lsl r2, r2, #1 - orr r2, r2, r11, lsr #31 - mul r2, r2, r12 - and r11, r11, #0x7fffffff + bl fe_mul_op + ldr r2, [sp, #184] + ldr r1, [sp, #176] + ldr r0, [sp, #176] + bl fe_mul_op + # Ensure result is less than modulus + ldr r0, [sp, #176] + ldm r0, {r4, r5, r6, r7, r8, r9, r10, r11} + mov r2, #19 + and r2, r2, r11, asr #31 adds r4, r4, r2 adcs r5, r5, #0 adcs r6, r6, #0 adcs r7, r7, #0 adcs r8, r8, #0 adcs r9, r9, #0 + bfc r11, #31, #1 adcs r10, r10, #0 adc r11, r11, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp, #32] - str r5, [sp, #36] -#else - strd r4, r5, [sp, #32] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [sp, #40] - str r7, [sp, #44] -#else - strd r6, r7, [sp, #40] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [sp, #48] - str r9, [sp, #52] -#else - strd r8, r9, [sp, #48] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #56] - str r11, [sp, #60] -#else - strd r10, r11, [sp, #56] -#endif - add r1, sp, #0x40 - add r0, sp, #0x40 - bl fe_sq - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #96] - ldr r5, [sp, #100] -#else - ldrd r4, r5, [sp, #96] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #104] - ldr r7, [sp, #108] -#else - ldrd r6, r7, [sp, #104] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [sp, #32] - ldr r9, [sp, #36] -#else - ldrd r8, r9, [sp, #32] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [sp, #40] - ldr r11, [sp, #44] -#else - ldrd r10, r11, [sp, #40] -#endif - adds r8, r4, r8 - adcs r9, r5, r9 - adcs r10, r6, r10 - adcs r11, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [sp, #96] - str r9, [sp, #100] -#else - strd r8, r9, [sp, #96] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #104] - str r11, [sp, #108] -#else - strd r10, r11, [sp, #104] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #112] - ldr r5, [sp, #116] -#else - ldrd r4, r5, [sp, #112] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #120] - ldr r7, [sp, #124] -#else - ldrd r6, r7, [sp, #120] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [sp, #48] - ldr r9, [sp, #52] -#else - ldrd r8, r9, [sp, #48] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [sp, #56] - ldr r11, [sp, #60] -#else - ldrd r10, r11, [sp, #56] -#endif - adcs r8, r4, r8 - adcs r9, r5, r9 - adcs r10, r6, r10 - adc r11, r7, r11 - mov r3, #-19 - asr r2, r11, #31 - # Mask the modulus - and r3, r2, r3 - and r12, r2, #0x7fffffff - # Sub modulus (if overflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [sp, #96] - ldr r5, [sp, #100] -#else - ldrd r4, r5, [sp, #96] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [sp, #104] - ldr r7, [sp, #108] -#else - ldrd r6, r7, [sp, #104] -#endif - subs r4, r4, r3 - sbcs r5, r5, r2 - sbcs r6, r6, r2 - sbcs r7, r7, r2 - sbcs r8, r8, r2 - sbcs r9, r9, r2 - sbcs r10, r10, r2 - sbc r11, r11, r12 -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) - str r4, [sp, #96] - str r5, [sp, #100] -#else - strd r4, r5, [sp, #96] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [sp, #104] - str r7, [sp, #108] -#else - strd r6, r7, [sp, #104] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [sp, #112] - str r9, [sp, #116] -#else - strd r8, r9, [sp, #112] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [sp, #120] - str r11, [sp, #124] -#else - strd r10, r11, [sp, #120] -#endif - add r2, sp, #0 - ldr r1, [sp, #168] - add r0, sp, #32 - bl fe_mul - add r2, sp, #0x60 - add r1, sp, #0x80 - add r0, sp, #0 - bl fe_mul - ldr r2, [sp, #176] - ldr r1, [sp, #180] - subs r1, r1, #1 - str r1, [sp, #180] - bge L_curve25519_bits - mov r1, #31 - str r1, [sp, #180] - subs r2, r2, #4 - str r2, [sp, #176] - bge L_curve25519_words - # Invert - add r0, sp, #32 - add r1, sp, #0 - bl fe_sq - add r0, sp, #0x40 - add r1, sp, #32 - bl fe_sq - add r0, sp, #0x40 - add r1, sp, #0x40 - bl fe_sq - add r0, sp, #0x40 - add r1, sp, #0 - add r2, sp, #0x40 - bl fe_mul - add r0, sp, #32 - add r1, sp, #32 - add r2, sp, #0x40 - bl fe_mul - add r0, sp, #0x60 - add r1, sp, #32 - bl fe_sq - add r0, sp, #0x40 - add r1, sp, #0x40 - add r2, sp, #0x60 - bl fe_mul - add r0, sp, #0x60 - add r1, sp, #0x40 - bl fe_sq - mov r4, #4 -L_curve25519_inv_1: - add r0, sp, #0x60 - add r1, sp, #0x60 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_curve25519_inv_1 - add r0, sp, #0x40 - add r1, sp, #0x60 - add r2, sp, #0x40 - bl fe_mul - add r0, sp, #0x60 - add r1, sp, #0x40 - bl fe_sq - mov r4, #9 -L_curve25519_inv_2: - add r0, sp, #0x60 - add r1, sp, #0x60 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_curve25519_inv_2 - add r0, sp, #0x60 - add r1, sp, #0x60 - add r2, sp, #0x40 - bl fe_mul - add r0, sp, #0x80 - add r1, sp, #0x60 - bl fe_sq - mov r4, #19 -L_curve25519_inv_3: - add r0, sp, #0x80 - add r1, sp, #0x80 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_curve25519_inv_3 - add r0, sp, #0x60 - add r1, sp, #0x80 - add r2, sp, #0x60 - bl fe_mul - mov r4, #10 -L_curve25519_inv_4: - add r0, sp, #0x60 - add r1, sp, #0x60 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_curve25519_inv_4 - add r0, sp, #0x40 - add r1, sp, #0x60 - add r2, sp, #0x40 - bl fe_mul - add r0, sp, #0x60 - add r1, sp, #0x40 - bl fe_sq - mov r4, #49 -L_curve25519_inv_5: - add r0, sp, #0x60 - add r1, sp, #0x60 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_curve25519_inv_5 - add r0, sp, #0x60 - add r1, sp, #0x60 - add r2, sp, #0x40 - bl fe_mul - add r0, sp, #0x80 - add r1, sp, #0x60 - bl fe_sq - mov r4, #0x63 -L_curve25519_inv_6: - add r0, sp, #0x80 - add r1, sp, #0x80 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_curve25519_inv_6 - add r0, sp, #0x60 - add r1, sp, #0x80 - add r2, sp, #0x60 - bl fe_mul - mov r4, #50 -L_curve25519_inv_7: - add r0, sp, #0x60 - add r1, sp, #0x60 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_curve25519_inv_7 - add r0, sp, #0x40 - add r1, sp, #0x60 - add r2, sp, #0x40 - bl fe_mul - mov r4, #5 -L_curve25519_inv_8: - add r0, sp, #0x40 - add r1, sp, #0x40 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_curve25519_inv_8 - add r0, sp, #0 - add r1, sp, #0x40 - add r2, sp, #32 - bl fe_mul - add r2, sp, #0 - ldr r1, [sp, #160] - ldr r0, [sp, #160] - bl fe_mul + stm r0, {r4, r5, r6, r7, r8, r9, r10, r11} mov r0, #0 - add sp, sp, #0xbc - pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} - .size curve25519,.-curve25519 - .text - .align 4 - .globl fe_pow22523 - .type fe_pow22523, %function 
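Note: the added `# Ensure result is less than modulus` block just above is a constant-time conditional subtraction of p = 2^255 - 19 from the packed result: `and r2, r2, r11, asr #31` selects 19 only when bit 255 is set, the carry chain adds it, and `bfc r11, #31, #1` clears the top bit, which together subtracts p exactly when the value had reached 2^255. A small C sketch under those assumptions; the helper name is mine, not the library's:

    #include <stdint.h>

    /* If bit 255 of r is set, subtract p = 2^255 - 19: add 19 and clear the bit. */
    static void fe_fold_top_bit_sketch(uint32_t r[8])
    {
        uint64_t c = 19U & (0U - (r[7] >> 31)); /* 19 iff bit 255 was set */
        r[7] &= 0x7fffffffU;                    /* clear bit 255 (the bfc) */
        for (int i = 0; i < 8; i++) {
            c += r[i];
            r[i] = (uint32_t)c;
            c >>= 32;
        }
    }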
-fe_pow22523: - push {r4, lr} - sub sp, sp, #0x68 - # pow22523 - str r0, [sp, #96] - str r1, [sp, #100] - mov r0, sp - ldr r1, [sp, #100] - bl fe_sq - add r0, sp, #32 - mov r1, sp - bl fe_sq - add r0, sp, #32 - add r1, sp, #32 - bl fe_sq - add r0, sp, #32 - ldr r1, [sp, #100] - add r2, sp, #32 - bl fe_mul - mov r0, sp - mov r1, sp - add r2, sp, #32 - bl fe_mul - mov r0, sp - mov r1, sp - bl fe_sq - mov r0, sp - add r1, sp, #32 - mov r2, sp - bl fe_mul - add r0, sp, #32 - mov r1, sp - bl fe_sq - mov r4, #4 -L_fe_pow22523_1: - add r0, sp, #32 - add r1, sp, #32 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_fe_pow22523_1 - mov r0, sp - add r1, sp, #32 - mov r2, sp - bl fe_mul - add r0, sp, #32 - mov r1, sp - bl fe_sq - mov r4, #9 -L_fe_pow22523_2: - add r0, sp, #32 - add r1, sp, #32 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_fe_pow22523_2 - add r0, sp, #32 - add r1, sp, #32 - mov r2, sp - bl fe_mul - add r0, sp, #0x40 - add r1, sp, #32 - bl fe_sq - mov r4, #19 -L_fe_pow22523_3: - add r0, sp, #0x40 - add r1, sp, #0x40 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_fe_pow22523_3 - add r0, sp, #32 - add r1, sp, #0x40 - add r2, sp, #32 - bl fe_mul - mov r4, #10 -L_fe_pow22523_4: - add r0, sp, #32 - add r1, sp, #32 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_fe_pow22523_4 - mov r0, sp - add r1, sp, #32 - mov r2, sp - bl fe_mul - add r0, sp, #32 - mov r1, sp - bl fe_sq - mov r4, #49 -L_fe_pow22523_5: - add r0, sp, #32 - add r1, sp, #32 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_fe_pow22523_5 - add r0, sp, #32 - add r1, sp, #32 - mov r2, sp - bl fe_mul - add r0, sp, #0x40 - add r1, sp, #32 - bl fe_sq - mov r4, #0x63 -L_fe_pow22523_6: - add r0, sp, #0x40 - add r1, sp, #0x40 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_fe_pow22523_6 - add r0, sp, #32 - add r1, sp, #0x40 - add r2, sp, #32 - bl fe_mul - mov r4, #50 -L_fe_pow22523_7: - add r0, sp, #32 - add r1, sp, #32 - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_fe_pow22523_7 - mov r0, sp - add r1, sp, #32 - mov r2, sp - bl fe_mul - mov r4, #2 -L_fe_pow22523_8: - mov r0, sp - mov r1, sp - bl fe_sq - sub r4, r4, #1 - cmp r4, #0 - bne L_fe_pow22523_8 - ldr r0, [sp, #96] - mov r1, sp - ldr r2, [sp, #100] - bl fe_mul - ldr r1, [sp, #100] - ldr r0, [sp, #96] - add sp, sp, #0x68 - pop {r4, pc} - .size fe_pow22523,.-fe_pow22523 - .text - .align 4 - .globl fe_ge_to_p2 - .type fe_ge_to_p2, %function -fe_ge_to_p2: - push {lr} - sub sp, sp, #16 - str r0, [sp] - str r1, [sp, #4] - str r2, [sp, #8] - str r3, [sp, #12] - ldr r2, [sp, #28] - ldr r1, [sp, #12] - ldr r0, [sp] - bl fe_mul - ldr r2, [sp, #24] - ldr r1, [sp, #20] - ldr r0, [sp, #4] - bl fe_mul - ldr r2, [sp, #28] - ldr r1, [sp, #24] - ldr r0, [sp, #8] - bl fe_mul - add sp, sp, #16 - pop {pc} - .size fe_ge_to_p2,.-fe_ge_to_p2 - .text - .align 4 - .globl fe_ge_to_p3 - .type fe_ge_to_p3, %function -fe_ge_to_p3: - push {lr} - sub sp, sp, #16 - str r0, [sp] - str r1, [sp, #4] - str r2, [sp, #8] - str r3, [sp, #12] - ldr r2, [sp, #32] - ldr r1, [sp, #20] - ldr r0, [sp] - bl fe_mul - ldr r2, [sp, #28] - ldr r1, [sp, #24] - ldr r0, [sp, #4] - bl fe_mul - ldr r2, [sp, #32] - ldr r1, [sp, #28] - ldr r0, [sp, #8] - bl fe_mul - ldr r2, [sp, #24] - ldr r1, [sp, #20] - ldr r0, [sp, #12] - bl fe_mul - add sp, sp, #16 - pop {pc} - .size fe_ge_to_p3,.-fe_ge_to_p3 - .text - .align 4 - .globl fe_ge_dbl - .type fe_ge_dbl, %function -fe_ge_dbl: - push {r4, r5, r6, r7, r8, r9, r10, r11, lr} - sub sp, sp, #16 - str r0, [sp] - str r1, [sp, #4] - str r2, [sp, #8] - str r3, [sp, #12] - ldr r1, [sp, #52] 
- ldr r0, [sp] - bl fe_sq - ldr r1, [sp, #56] - ldr r0, [sp, #8] - bl fe_sq - ldr r0, [sp, #4] - ldr r1, [sp, #52] - ldr r2, [sp, #56] - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #8] - ldr r7, [r1, #12] -#else - ldrd r6, r7, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2] - ldr r9, [r2, #4] -#else - ldrd r8, r9, [r2] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #8] - ldr r11, [r2, #12] -#else - ldrd r10, r11, [r2, #8] -#endif - adds r8, r4, r8 - adcs r9, r5, r9 - adcs r10, r6, r10 - adcs r11, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] -#else - strd r8, r9, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #8] - str r11, [r0, #12] -#else - strd r10, r11, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] -#else - ldrd r4, r5, [r1, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #24] - ldr r7, [r1, #28] -#else - ldrd r6, r7, [r1, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2, #16] - ldr r9, [r2, #20] -#else - ldrd r8, r9, [r2, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #24] - ldr r11, [r2, #28] -#else - ldrd r10, r11, [r2, #24] -#endif - adcs r8, r4, r8 - adcs r9, r5, r9 - adcs r10, r6, r10 - adc r11, r7, r11 - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Sub modulus (if overflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #8] - ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif - subs r4, r4, r12 - sbcs r5, r5, r3 - sbcs r6, r6, r3 - sbcs r7, r7, r3 - sbcs r8, r8, r3 - sbcs r9, r9, r3 - sbcs r10, r10, r3 - sbc r11, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] -#else - strd r6, r7, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #24] - str r11, [r0, #28] -#else - strd r10, r11, [r0, #24] -#endif - ldr r1, [sp, #4] - ldr r0, [sp, #12] - bl fe_sq - ldr r0, [sp, #4] - ldr r1, [sp, #8] - ldr r2, [sp] - # Add-Sub - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r2] - ldr r7, [r2, #4] -#else - ldrd r6, r7, [r2] -#endif - adds r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] -#else - strd r8, r9, [r0] -#endif - # Sub - subs r10, r4, r6 - mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1] 
- str r11, [r1, #4] -#else - strd r10, r11, [r1] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #8] - ldr r5, [r1, #12] -#else - ldrd r4, r5, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r2, #8] - ldr r7, [r2, #12] -#else - ldrd r6, r7, [r2, #8] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #8] - str r9, [r0, #12] -#else - strd r8, r9, [r0, #8] -#endif - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #8] - str r11, [r1, #12] -#else - strd r10, r11, [r1, #8] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] -#else - ldrd r4, r5, [r1, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r2, #16] - ldr r7, [r2, #20] -#else - ldrd r6, r7, [r2, #16] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #16] - str r11, [r1, #20] -#else - strd r10, r11, [r1, #16] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #24] - ldr r5, [r1, #28] -#else - ldrd r4, r5, [r1, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r2, #24] - ldr r7, [r2, #28] -#else - ldrd r6, r7, [r2, #24] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - adc r9, r5, r7 - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - sbc r11, r5, r7 - mov r12, #-19 - asr r3, r9, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Sub modulus (if overflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif - subs r4, r4, r12 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #8] - ldr r5, [r0, #12] -#else - ldrd r4, r5, [r0, #8] -#endif - sbcs r4, r4, r3 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #8] - str r5, [r0, #12] -#else - strd r4, r5, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #16] - ldr r5, [r0, #20] -#else - ldrd r4, r5, [r0, #16] -#endif - sbcs r4, r4, r3 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #16] - str r5, [r0, #20] -#else - strd r4, r5, [r0, #16] -#endif - sbcs r8, r8, r3 - sbc r9, r9, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #24] - str r9, [r0, #28] -#else - strd r8, r9, [r0, #24] -#endif - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Add modulus (if underflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif - adds r4, r4, r12 - adcs r5, r5, r3 
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1] - str r5, [r1, #4] -#else - strd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #8] - ldr r5, [r1, #12] -#else - ldrd r4, r5, [r1, #8] -#endif - adcs r4, r4, r3 - adcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1, #8] - str r5, [r1, #12] -#else - strd r4, r5, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] -#else - ldrd r4, r5, [r1, #16] -#endif - adcs r4, r4, r3 - adcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1, #16] - str r5, [r1, #20] -#else - strd r4, r5, [r1, #16] -#endif - adcs r10, r10, r3 - adc r11, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #24] - str r11, [r1, #28] -#else - strd r10, r11, [r1, #24] -#endif - ldr r0, [sp] - ldr r1, [sp, #12] - ldr r2, [sp, #4] - # Sub -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #8] - ldr r7, [r1, #12] -#else - ldrd r6, r7, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2] - ldr r9, [r2, #4] -#else - ldrd r8, r9, [r2] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #8] - ldr r11, [r2, #12] -#else - ldrd r10, r11, [r2, #8] -#endif - subs r8, r4, r8 - sbcs r9, r5, r9 - sbcs r10, r6, r10 - sbcs r11, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] -#else - strd r8, r9, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #8] - str r11, [r0, #12] -#else - strd r10, r11, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] -#else - ldrd r4, r5, [r1, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #24] - ldr r7, [r1, #28] -#else - ldrd r6, r7, [r1, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2, #16] - ldr r9, [r2, #20] -#else - ldrd r8, r9, [r2, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #24] - ldr r11, [r2, #28] -#else - ldrd r10, r11, [r2, #24] -#endif - sbcs r8, r4, r8 - sbcs r9, r5, r9 - sbcs r10, r6, r10 - sbc r11, r7, r11 - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Add modulus (if underflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #8] - ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif - adds r4, r4, r12 - adcs r5, r5, r3 - adcs r6, r6, r3 - adcs r7, r7, r3 - adcs r8, r8, r3 - adcs r9, r9, r3 - adcs r10, r10, r3 - adc r11, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] -#else - strd r6, r7, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #24] - str r11, [r0, #28] -#else - strd r10, r11, [r0, #24] -#endif - ldr r1, [sp, #60] - ldr r0, [sp, #12] - bl fe_sq2 - ldr r0, [sp, #12] - ldr r1, [sp, #8] - # Sub -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #8] - ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r1] - ldr r9, [r1, #4] -#else - ldrd r8, r9, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r1, #8] - ldr r11, [r1, #12] -#else - ldrd r10, r11, [r1, #8] -#endif - subs r8, r4, r8 - sbcs r9, r5, r9 - sbcs r10, r6, r10 - sbcs r11, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] -#else - strd r8, r9, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #8] - str r11, [r0, #12] -#else - strd r10, r11, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #16] - ldr r5, [r0, #20] -#else - ldrd r4, r5, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #24] - ldr r7, [r0, #28] -#else - ldrd r6, r7, [r0, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r1, #16] - ldr r9, [r1, #20] -#else - ldrd r8, r9, [r1, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r1, #24] - ldr r11, [r1, #28] -#else - ldrd r10, r11, [r1, #24] -#endif - sbcs r8, r4, r8 - sbcs r9, r5, r9 - sbcs r10, r6, r10 - sbc r11, r7, r11 - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Add modulus (if underflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #8] - ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif - adds r4, r4, r12 - adcs r5, r5, r3 - adcs r6, r6, r3 - adcs r7, r7, r3 - adcs r8, r8, r3 - adcs r9, r9, r3 - adcs r10, r10, r3 - adc r11, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] -#else - strd r6, r7, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #24] - str r11, [r0, #28] -#else - strd r10, r11, [r0, #24] -#endif - add sp, sp, #16 - pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} - .size fe_ge_dbl,.-fe_ge_dbl - .text - .align 4 - .globl fe_ge_madd - .type fe_ge_madd, %function -fe_ge_madd: - push {r4, r5, r6, r7, r8, r9, r10, r11, lr} - sub sp, sp, #32 - str r0, [sp] - str r1, [sp, #4] - str r2, [sp, #8] - str r3, [sp, #12] - ldr r0, [sp] - ldr r1, [sp, #72] - ldr r2, [sp, #68] - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #8] - ldr r7, [r1, #12] -#else - ldrd r6, r7, [r1, #8] 
-#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2] - ldr r9, [r2, #4] -#else - ldrd r8, r9, [r2] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #8] - ldr r11, [r2, #12] -#else - ldrd r10, r11, [r2, #8] -#endif - adds r8, r4, r8 - adcs r9, r5, r9 - adcs r10, r6, r10 - adcs r11, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] -#else - strd r8, r9, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #8] - str r11, [r0, #12] -#else - strd r10, r11, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] -#else - ldrd r4, r5, [r1, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #24] - ldr r7, [r1, #28] -#else - ldrd r6, r7, [r1, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2, #16] - ldr r9, [r2, #20] -#else - ldrd r8, r9, [r2, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #24] - ldr r11, [r2, #28] -#else - ldrd r10, r11, [r2, #24] -#endif - adcs r8, r4, r8 - adcs r9, r5, r9 - adcs r10, r6, r10 - adc r11, r7, r11 - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Sub modulus (if overflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #8] - ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif - subs r4, r4, r12 - sbcs r5, r5, r3 - sbcs r6, r6, r3 - sbcs r7, r7, r3 - sbcs r8, r8, r3 - sbcs r9, r9, r3 - sbcs r10, r10, r3 - sbc r11, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] -#else - strd r6, r7, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #24] - str r11, [r0, #28] -#else - strd r10, r11, [r0, #24] -#endif - ldr r0, [sp, #4] - ldr r1, [sp, #72] - ldr r2, [sp, #68] - # Sub -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #8] - ldr r7, [r1, #12] -#else - ldrd r6, r7, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2] - ldr r9, [r2, #4] -#else - ldrd r8, r9, [r2] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #8] - ldr r11, [r2, #12] -#else - ldrd r10, r11, [r2, #8] -#endif - subs r8, r4, r8 - sbcs r9, r5, r9 - sbcs r10, r6, r10 - sbcs r11, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] -#else - strd r8, r9, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #8] - str r11, [r0, #12] -#else - strd r10, r11, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] -#else - ldrd r4, r5, [r1, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #24] - ldr r7, [r1, #28] -#else - ldrd r6, r7, [r1, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2, #16] - ldr r9, [r2, #20] -#else - ldrd r8, r9, [r2, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #24] - ldr r11, [r2, #28] -#else - ldrd r10, r11, [r2, #24] -#endif - sbcs r8, r4, r8 - sbcs r9, r5, r9 - sbcs r10, r6, r10 - sbc r11, r7, r11 - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Add modulus (if underflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #8] - ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif - adds r4, r4, r12 - adcs r5, r5, r3 - adcs r6, r6, r3 - adcs r7, r7, r3 - adcs r8, r8, r3 - adcs r9, r9, r3 - adcs r10, r10, r3 - adc r11, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] -#else - strd r6, r7, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #24] - str r11, [r0, #28] -#else - strd r10, r11, [r0, #24] -#endif - ldr r2, [sp, #88] - ldr r1, [sp] - ldr r0, [sp, #8] - bl fe_mul - ldr r2, [sp, #92] - ldr r1, [sp, #4] - ldr r0, [sp, #4] - bl fe_mul - ldr r2, [sp, #80] - ldr r1, [sp, #84] - ldr r0, [sp, #12] - bl fe_mul - ldr r0, [sp, #4] - ldr r1, [sp] - ldr r2, [sp, #8] - # Add-Sub - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2] - ldr r5, [r2, #4] -#else - ldrd r4, r5, [r2] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0] - ldr r7, [r0, #4] -#else - ldrd r6, r7, [r0] -#endif - adds r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] -#else - strd r8, r9, [r0] -#endif - # Sub - subs r10, r4, r6 - mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1] - str r11, [r1, #4] -#else - strd r10, r11, [r1] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2, #8] - ldr r5, [r2, #12] -#else - ldrd r4, r5, [r2, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #8] - ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #8] - str r9, [r0, #12] -#else - strd r8, r9, [r0, #8] -#endif - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #8] - str r11, [r1, #12] -#else - strd r10, r11, [r1, #8] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2, #16] - ldr r5, [r2, #20] -#else - ldrd r4, r5, [r2, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #16] - ldr r7, [r0, 
#20] -#else - ldrd r6, r7, [r0, #16] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #16] - str r11, [r1, #20] -#else - strd r10, r11, [r1, #16] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2, #24] - ldr r5, [r2, #28] -#else - ldrd r4, r5, [r2, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #24] - ldr r7, [r0, #28] -#else - ldrd r6, r7, [r0, #24] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - adc r9, r5, r7 - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - sbc r11, r5, r7 - mov r12, #-19 - asr r3, r9, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Sub modulus (if overflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif - subs r4, r4, r12 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #8] - ldr r5, [r0, #12] -#else - ldrd r4, r5, [r0, #8] -#endif - sbcs r4, r4, r3 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #8] - str r5, [r0, #12] -#else - strd r4, r5, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #16] - ldr r5, [r0, #20] -#else - ldrd r4, r5, [r0, #16] -#endif - sbcs r4, r4, r3 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #16] - str r5, [r0, #20] -#else - strd r4, r5, [r0, #16] -#endif - sbcs r8, r8, r3 - sbc r9, r9, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #24] - str r9, [r0, #28] -#else - strd r8, r9, [r0, #24] -#endif - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Add modulus (if underflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif - adds r4, r4, r12 - adcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1] - str r5, [r1, #4] -#else - strd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #8] - ldr r5, [r1, #12] -#else - ldrd r4, r5, [r1, #8] -#endif - adcs r4, r4, r3 - adcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1, #8] - str r5, [r1, #12] -#else - strd r4, r5, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] -#else - ldrd r4, r5, [r1, #16] -#endif - adcs r4, r4, r3 - adcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1, #16] - str r5, [r1, #20] -#else - strd r4, r5, [r1, #16] -#endif - adcs r10, r10, r3 - adc r11, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #24] - str r11, [r1, #28] -#else - strd r10, r11, [r1, #24] -#endif - ldr r0, [sp, #8] - ldr r1, [sp, #76] - # Double -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) 
- ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #8] - ldr r7, [r1, #12] -#else - ldrd r6, r7, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r1, #16] - ldr r9, [r1, #20] -#else - ldrd r8, r9, [r1, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r1, #24] - ldr r11, [r1, #28] -#else - ldrd r10, r11, [r1, #24] -#endif - adds r4, r4, r4 - adcs r5, r5, r5 - adcs r6, r6, r6 - adcs r7, r7, r7 - adcs r8, r8, r8 - adcs r9, r9, r9 - adcs r10, r10, r10 - adc r11, r11, r11 - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Sub modulus (if overflow) - subs r4, r4, r12 - sbcs r5, r5, r3 - sbcs r6, r6, r3 - sbcs r7, r7, r3 - sbcs r8, r8, r3 - sbcs r9, r9, r3 - sbcs r10, r10, r3 - sbc r11, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] -#else - strd r6, r7, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #24] - str r11, [r0, #28] -#else - strd r10, r11, [r0, #24] -#endif - ldr r0, [sp, #8] - ldr r1, [sp, #12] - # Add-Sub - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1] - ldr r7, [r1, #4] -#else - ldrd r6, r7, [r1] -#endif - adds r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] -#else - strd r8, r9, [r0] -#endif - # Sub - subs r10, r4, r6 - mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1] - str r11, [r1, #4] -#else - strd r10, r11, [r1] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #8] - ldr r5, [r0, #12] -#else - ldrd r4, r5, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #8] - ldr r7, [r1, #12] -#else - ldrd r6, r7, [r1, #8] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #8] - str r9, [r0, #12] -#else - strd r8, r9, [r0, #8] -#endif - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #8] - str r11, [r1, #12] -#else - strd r10, r11, [r1, #8] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #16] - ldr r5, [r0, #20] -#else - ldrd r4, r5, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #16] - ldr r7, [r1, #20] -#else - ldrd r6, r7, [r1, #16] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif - # Sub - adds lr, lr, #-1 - sbcs r10, 
r4, r6 - mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #16] - str r11, [r1, #20] -#else - strd r10, r11, [r1, #16] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #24] - ldr r5, [r0, #28] -#else - ldrd r4, r5, [r0, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #24] - ldr r7, [r1, #28] -#else - ldrd r6, r7, [r1, #24] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - adc r9, r5, r7 - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - sbc r11, r5, r7 - mov r12, #-19 - asr r3, r9, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Sub modulus (if overflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif - subs r4, r4, r12 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #8] - ldr r5, [r0, #12] -#else - ldrd r4, r5, [r0, #8] -#endif - sbcs r4, r4, r3 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #8] - str r5, [r0, #12] -#else - strd r4, r5, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #16] - ldr r5, [r0, #20] -#else - ldrd r4, r5, [r0, #16] -#endif - sbcs r4, r4, r3 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #16] - str r5, [r0, #20] -#else - strd r4, r5, [r0, #16] -#endif - sbcs r8, r8, r3 - sbc r9, r9, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #24] - str r9, [r0, #28] -#else - strd r8, r9, [r0, #24] -#endif - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Add modulus (if underflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif - adds r4, r4, r12 - adcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1] - str r5, [r1, #4] -#else - strd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #8] - ldr r5, [r1, #12] -#else - ldrd r4, r5, [r1, #8] -#endif - adcs r4, r4, r3 - adcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1, #8] - str r5, [r1, #12] -#else - strd r4, r5, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] -#else - ldrd r4, r5, [r1, #16] -#endif - adcs r4, r4, r3 - adcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1, #16] - str r5, [r1, #20] -#else - strd r4, r5, [r1, #16] -#endif - adcs r10, r10, r3 - adc r11, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #24] - str r11, [r1, #28] -#else - strd r10, r11, [r1, #24] -#endif - add sp, sp, #32 - pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} - .size fe_ge_madd,.-fe_ge_madd - .text - .align 4 - .globl fe_ge_msub - .type fe_ge_msub, %function -fe_ge_msub: - push {r4, r5, r6, r7, r8, r9, r10, r11, lr} - sub sp, sp, #32 - str r0, [sp] - str r1, [sp, #4] - str r2, [sp, #8] - str r3, [sp, #12] - ldr r0, [sp] - ldr r1, [sp, #72] - ldr r2, [sp, #68] - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #8] - ldr r7, [r1, #12] -#else - ldrd r6, r7, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2] - ldr r9, [r2, #4] -#else - ldrd r8, r9, [r2] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #8] - ldr r11, [r2, #12] -#else - ldrd r10, r11, [r2, #8] -#endif - adds r8, r4, r8 - adcs r9, r5, r9 - adcs r10, r6, r10 - adcs r11, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] -#else - strd r8, r9, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #8] - str r11, [r0, #12] -#else - strd r10, r11, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] -#else - ldrd r4, r5, [r1, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #24] - ldr r7, [r1, #28] -#else - ldrd r6, r7, [r1, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2, #16] - ldr r9, [r2, #20] -#else - ldrd r8, r9, [r2, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #24] - ldr r11, [r2, #28] -#else - ldrd r10, r11, [r2, #24] -#endif - adcs r8, r4, r8 - adcs r9, r5, r9 - adcs r10, r6, r10 - adc r11, r7, r11 - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Sub modulus (if overflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #8] - ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif - subs r4, r4, r12 - sbcs r5, r5, r3 - sbcs r6, r6, r3 - sbcs r7, r7, r3 - sbcs r8, r8, r3 - sbcs r9, r9, r3 - sbcs r10, r10, r3 - sbc r11, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] -#else - strd r6, r7, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #24] - str r11, [r0, #28] -#else - strd r10, r11, [r0, #24] -#endif - ldr r0, [sp, #4] - ldr r1, [sp, #72] - ldr r2, [sp, #68] - # Sub -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #8] - ldr r7, [r1, #12] -#else - ldrd r6, r7, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2] - ldr r9, [r2, #4] -#else - ldrd r8, r9, [r2] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #8] - ldr r11, [r2, #12] -#else - ldrd r10, r11, [r2, #8] -#endif - subs r8, r4, r8 - sbcs r9, r5, r9 - sbcs r10, r6, r10 - sbcs r11, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] -#else - strd r8, r9, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #8] - str r11, 
[r0, #12] -#else - strd r10, r11, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] -#else - ldrd r4, r5, [r1, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #24] - ldr r7, [r1, #28] -#else - ldrd r6, r7, [r1, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2, #16] - ldr r9, [r2, #20] -#else - ldrd r8, r9, [r2, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #24] - ldr r11, [r2, #28] -#else - ldrd r10, r11, [r2, #24] -#endif - sbcs r8, r4, r8 - sbcs r9, r5, r9 - sbcs r10, r6, r10 - sbc r11, r7, r11 - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Add modulus (if underflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #8] - ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif - adds r4, r4, r12 - adcs r5, r5, r3 - adcs r6, r6, r3 - adcs r7, r7, r3 - adcs r8, r8, r3 - adcs r9, r9, r3 - adcs r10, r10, r3 - adc r11, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] -#else - strd r6, r7, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #24] - str r11, [r0, #28] -#else - strd r10, r11, [r0, #24] -#endif - ldr r2, [sp, #92] - ldr r1, [sp] - ldr r0, [sp, #8] - bl fe_mul - ldr r2, [sp, #88] - ldr r1, [sp, #4] - ldr r0, [sp, #4] - bl fe_mul - ldr r2, [sp, #80] - ldr r1, [sp, #84] - ldr r0, [sp, #12] - bl fe_mul - ldr r0, [sp, #4] - ldr r1, [sp] - ldr r2, [sp, #8] - # Add-Sub - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2] - ldr r5, [r2, #4] -#else - ldrd r4, r5, [r2] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0] - ldr r7, [r0, #4] -#else - ldrd r6, r7, [r0] -#endif - adds r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] -#else - strd r8, r9, [r0] -#endif - # Sub - subs r10, r4, r6 - mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1] - str r11, [r1, #4] -#else - strd r10, r11, [r1] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2, #8] - ldr r5, [r2, #12] -#else - ldrd r4, r5, [r2, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #8] - ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #8] - str r9, [r0, #12] -#else - strd r8, r9, [r0, #8] -#endif - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #8] - str r11, [r1, #12] -#else - strd r10, r11, [r1, #8] -#endif - # Add -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2, #16] - ldr r5, [r2, #20] -#else - ldrd r4, r5, [r2, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #16] - ldr r7, [r0, #20] -#else - ldrd r6, r7, [r0, #16] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #16] - str r11, [r1, #20] -#else - strd r10, r11, [r1, #16] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2, #24] - ldr r5, [r2, #28] -#else - ldrd r4, r5, [r2, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #24] - ldr r7, [r0, #28] -#else - ldrd r6, r7, [r0, #24] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - adc r9, r5, r7 - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - sbc r11, r5, r7 - mov r12, #-19 - asr r3, r9, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Sub modulus (if overflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif - subs r4, r4, r12 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #8] - ldr r5, [r0, #12] -#else - ldrd r4, r5, [r0, #8] -#endif - sbcs r4, r4, r3 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #8] - str r5, [r0, #12] -#else - strd r4, r5, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #16] - ldr r5, [r0, #20] -#else - ldrd r4, r5, [r0, #16] -#endif - sbcs r4, r4, r3 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #16] - str r5, [r0, #20] -#else - strd r4, r5, [r0, #16] -#endif - sbcs r8, r8, r3 - sbc r9, r9, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #24] - str r9, [r0, #28] -#else - strd r8, r9, [r0, #24] -#endif - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Add modulus (if underflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif - adds r4, r4, r12 - adcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1] - str r5, [r1, #4] -#else - strd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #8] - ldr r5, [r1, #12] -#else - ldrd r4, r5, [r1, #8] -#endif - adcs r4, r4, r3 - adcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1, #8] - str r5, [r1, #12] -#else - strd r4, r5, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] -#else - ldrd r4, r5, [r1, #16] -#endif - adcs r4, r4, r3 - adcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1, #16] - str r5, [r1, #20] -#else - strd r4, r5, [r1, #16] -#endif - adcs r10, r10, r3 - adc r11, r11, lr -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #24] - str r11, [r1, #28] -#else - strd r10, r11, [r1, #24] -#endif - ldr r0, [sp, #8] - ldr r1, [sp, #76] - # Double -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #8] - ldr r7, [r1, #12] -#else - ldrd r6, r7, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r1, #16] - ldr r9, [r1, #20] -#else - ldrd r8, r9, [r1, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r1, #24] - ldr r11, [r1, #28] -#else - ldrd r10, r11, [r1, #24] -#endif - adds r4, r4, r4 - adcs r5, r5, r5 - adcs r6, r6, r6 - adcs r7, r7, r7 - adcs r8, r8, r8 - adcs r9, r9, r9 - adcs r10, r10, r10 - adc r11, r11, r11 - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Sub modulus (if overflow) - subs r4, r4, r12 - sbcs r5, r5, r3 - sbcs r6, r6, r3 - sbcs r7, r7, r3 - sbcs r8, r8, r3 - sbcs r9, r9, r3 - sbcs r10, r10, r3 - sbc r11, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] -#else - strd r6, r7, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #24] - str r11, [r0, #28] -#else - strd r10, r11, [r0, #24] -#endif - ldr r0, [sp, #12] - ldr r1, [sp, #8] - # Add-Sub - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0] - ldr r7, [r0, #4] -#else - ldrd r6, r7, [r0] -#endif - adds r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] -#else - strd r8, r9, [r0] -#endif - # Sub - subs r10, r4, r6 - mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1] - str r11, [r1, #4] -#else - strd r10, r11, [r1] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #8] - ldr r5, [r1, #12] -#else - ldrd r4, r5, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #8] - ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #8] - str r9, [r0, #12] -#else - strd r8, r9, [r0, #8] -#endif - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #8] - str r11, [r1, #12] -#else - strd r10, r11, [r1, #8] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] -#else - ldrd r4, r5, [r1, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #16] - ldr r7, [r0, #20] -#else - ldrd r6, r7, [r0, #16] -#endif - adds r12, r12, 
#-1 - adcs r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #16] - str r11, [r1, #20] -#else - strd r10, r11, [r1, #16] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #24] - ldr r5, [r1, #28] -#else - ldrd r4, r5, [r1, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #24] - ldr r7, [r0, #28] -#else - ldrd r6, r7, [r0, #24] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - adc r9, r5, r7 - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - sbc r11, r5, r7 - mov r12, #-19 - asr r3, r9, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Sub modulus (if overflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif - subs r4, r4, r12 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #8] - ldr r5, [r0, #12] -#else - ldrd r4, r5, [r0, #8] -#endif - sbcs r4, r4, r3 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #8] - str r5, [r0, #12] -#else - strd r4, r5, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #16] - ldr r5, [r0, #20] -#else - ldrd r4, r5, [r0, #16] -#endif - sbcs r4, r4, r3 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #16] - str r5, [r0, #20] -#else - strd r4, r5, [r0, #16] -#endif - sbcs r8, r8, r3 - sbc r9, r9, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #24] - str r9, [r0, #28] -#else - strd r8, r9, [r0, #24] -#endif - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Add modulus (if underflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif - adds r4, r4, r12 - adcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1] - str r5, [r1, #4] -#else - strd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #8] - ldr r5, [r1, #12] -#else - ldrd r4, r5, [r1, #8] -#endif - adcs r4, r4, r3 - adcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1, #8] - str r5, [r1, #12] -#else - strd r4, r5, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] -#else - ldrd r4, r5, [r1, #16] -#endif - adcs r4, r4, r3 - adcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1, #16] - str r5, [r1, #20] -#else - strd r4, r5, [r1, #16] -#endif - adcs r10, r10, r3 - adc r11, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #24] - str r11, [r1, #28] -#else - strd r10, r11, [r1, #24] -#endif - add sp, sp, #32 - pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} - .size fe_ge_msub,.-fe_ge_msub - .text - .align 4 - .globl fe_ge_add - .type fe_ge_add, %function -fe_ge_add: 
- push {r4, r5, r6, r7, r8, r9, r10, r11, lr} - sub sp, sp, #0x60 - str r0, [sp] - str r1, [sp, #4] - str r2, [sp, #8] - str r3, [sp, #12] - ldr r0, [sp] - ldr r1, [sp, #136] - ldr r2, [sp, #132] - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #8] - ldr r7, [r1, #12] -#else - ldrd r6, r7, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2] - ldr r9, [r2, #4] -#else - ldrd r8, r9, [r2] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #8] - ldr r11, [r2, #12] -#else - ldrd r10, r11, [r2, #8] -#endif - adds r8, r4, r8 - adcs r9, r5, r9 - adcs r10, r6, r10 - adcs r11, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] -#else - strd r8, r9, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #8] - str r11, [r0, #12] -#else - strd r10, r11, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] -#else - ldrd r4, r5, [r1, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #24] - ldr r7, [r1, #28] -#else - ldrd r6, r7, [r1, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2, #16] - ldr r9, [r2, #20] -#else - ldrd r8, r9, [r2, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #24] - ldr r11, [r2, #28] -#else - ldrd r10, r11, [r2, #24] -#endif - adcs r8, r4, r8 - adcs r9, r5, r9 - adcs r10, r6, r10 - adc r11, r7, r11 - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Sub modulus (if overflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #8] - ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif - subs r4, r4, r12 - sbcs r5, r5, r3 - sbcs r6, r6, r3 - sbcs r7, r7, r3 - sbcs r8, r8, r3 - sbcs r9, r9, r3 - sbcs r10, r10, r3 - sbc r11, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] -#else - strd r6, r7, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #24] - str r11, [r0, #28] -#else - strd r10, r11, [r0, #24] -#endif - ldr r0, [sp, #4] - ldr r1, [sp, #136] - ldr r2, [sp, #132] - # Sub -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #8] - ldr r7, [r1, #12] -#else - ldrd r6, r7, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2] - ldr r9, [r2, #4] -#else - ldrd r8, r9, [r2] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #8] - ldr r11, [r2, #12] -#else - ldrd r10, r11, [r2, #8] -#endif - subs r8, r4, r8 - sbcs r9, r5, r9 - sbcs r10, r6, r10 - sbcs 
r11, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] -#else - strd r8, r9, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #8] - str r11, [r0, #12] -#else - strd r10, r11, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] -#else - ldrd r4, r5, [r1, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #24] - ldr r7, [r1, #28] -#else - ldrd r6, r7, [r1, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2, #16] - ldr r9, [r2, #20] -#else - ldrd r8, r9, [r2, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #24] - ldr r11, [r2, #28] -#else - ldrd r10, r11, [r2, #24] -#endif - sbcs r8, r4, r8 - sbcs r9, r5, r9 - sbcs r10, r6, r10 - sbc r11, r7, r11 - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Add modulus (if underflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #8] - ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif - adds r4, r4, r12 - adcs r5, r5, r3 - adcs r6, r6, r3 - adcs r7, r7, r3 - adcs r8, r8, r3 - adcs r9, r9, r3 - adcs r10, r10, r3 - adc r11, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] -#else - strd r6, r7, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #24] - str r11, [r0, #28] -#else - strd r10, r11, [r0, #24] -#endif - ldr r2, [sp, #156] - ldr r1, [sp] - ldr r0, [sp, #8] - bl fe_mul - ldr r2, [sp, #160] - ldr r1, [sp, #4] - ldr r0, [sp, #4] - bl fe_mul - ldr r2, [sp, #144] - ldr r1, [sp, #152] - ldr r0, [sp, #12] - bl fe_mul - ldr r2, [sp, #148] - ldr r1, [sp, #140] - ldr r0, [sp] - bl fe_mul - add r0, sp, #16 - ldr r1, [sp] - # Double -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #8] - ldr r7, [r1, #12] -#else - ldrd r6, r7, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r1, #16] - ldr r9, [r1, #20] -#else - ldrd r8, r9, [r1, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r1, #24] - ldr r11, [r1, #28] -#else - ldrd r10, r11, [r1, #24] -#endif - adds r4, r4, r4 - adcs r5, r5, r5 - adcs r6, r6, r6 - adcs r7, r7, r7 - adcs r8, r8, r8 - adcs r9, r9, r9 - adcs r10, r10, r10 - adc r11, r11, r11 - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Sub modulus (if overflow) - subs r4, r4, r12 - sbcs r5, r5, r3 - sbcs r6, r6, r3 - sbcs r7, r7, r3 - sbcs r8, r8, r3 - sbcs r9, r9, r3 - sbcs r10, r10, r3 - sbc r11, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] -#else - strd r6, r7, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #24] - str r11, [r0, #28] -#else - strd r10, r11, [r0, #24] -#endif - ldr r0, [sp, #4] - ldr r1, [sp] - ldr r2, [sp, #8] - # Add-Sub - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2] - ldr r5, [r2, #4] -#else - ldrd r4, r5, [r2] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0] - ldr r7, [r0, #4] -#else - ldrd r6, r7, [r0] -#endif - adds r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] -#else - strd r8, r9, [r0] -#endif - # Sub - subs r10, r4, r6 - mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1] - str r11, [r1, #4] -#else - strd r10, r11, [r1] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2, #8] - ldr r5, [r2, #12] -#else - ldrd r4, r5, [r2, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #8] - ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #8] - str r9, [r0, #12] -#else - strd r8, r9, [r0, #8] -#endif - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #8] - str r11, [r1, #12] -#else - strd r10, r11, [r1, #8] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2, #16] - ldr r5, [r2, #20] -#else - ldrd r4, r5, [r2, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #16] - ldr r7, [r0, #20] -#else - ldrd r6, r7, [r0, #16] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #16] - str r11, [r1, #20] -#else - strd r10, r11, [r1, #16] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2, #24] - ldr r5, [r2, #28] -#else - ldrd r4, r5, [r2, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #24] - ldr r7, [r0, #28] -#else - ldrd r6, r7, [r0, #24] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - adc r9, r5, r7 - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - sbc r11, r5, r7 - mov r12, #-19 - asr r3, r9, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Sub modulus (if overflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif - subs r4, r4, r12 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) 
&& (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #8] - ldr r5, [r0, #12] -#else - ldrd r4, r5, [r0, #8] -#endif - sbcs r4, r4, r3 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #8] - str r5, [r0, #12] -#else - strd r4, r5, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #16] - ldr r5, [r0, #20] -#else - ldrd r4, r5, [r0, #16] -#endif - sbcs r4, r4, r3 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #16] - str r5, [r0, #20] -#else - strd r4, r5, [r0, #16] -#endif - sbcs r8, r8, r3 - sbc r9, r9, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #24] - str r9, [r0, #28] -#else - strd r8, r9, [r0, #24] -#endif - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Add modulus (if underflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif - adds r4, r4, r12 - adcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1] - str r5, [r1, #4] -#else - strd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #8] - ldr r5, [r1, #12] -#else - ldrd r4, r5, [r1, #8] -#endif - adcs r4, r4, r3 - adcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1, #8] - str r5, [r1, #12] -#else - strd r4, r5, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] -#else - ldrd r4, r5, [r1, #16] -#endif - adcs r4, r4, r3 - adcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1, #16] - str r5, [r1, #20] -#else - strd r4, r5, [r1, #16] -#endif - adcs r10, r10, r3 - adc r11, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #24] - str r11, [r1, #28] -#else - strd r10, r11, [r1, #24] -#endif - ldr r0, [sp, #8] - ldr r1, [sp, #12] - add r2, sp, #16 - # Add-Sub - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2] - ldr r5, [r2, #4] -#else - ldrd r4, r5, [r2] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1] - ldr r7, [r1, #4] -#else - ldrd r6, r7, [r1] -#endif - adds r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] -#else - strd r8, r9, [r0] -#endif - # Sub - subs r10, r4, r6 - mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1] - str r11, [r1, #4] -#else - strd r10, r11, [r1] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2, #8] - ldr r5, [r2, #12] -#else - ldrd r4, r5, [r2, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #8] - ldr r7, [r1, #12] -#else - ldrd r6, r7, [r1, #8] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #8] - str r9, [r0, #12] -#else - strd r8, r9, [r0, #8] -#endif - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #8] - str r11, [r1, #12] -#else - strd r10, r11, [r1, #8] 
-#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2, #16] - ldr r5, [r2, #20] -#else - ldrd r4, r5, [r2, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #16] - ldr r7, [r1, #20] -#else - ldrd r6, r7, [r1, #16] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 + add sp, sp, #0xc0 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size curve25519,.-curve25519 +#endif /* WC_NO_CACHE_RESISTANT */ +#ifdef HAVE_ED25519 + .text + .align 4 + .globl fe_invert + .type fe_invert, %function +fe_invert: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #0x88 + # Invert + str r0, [sp, #128] + str r1, [sp, #132] + ldr r1, [sp, #132] + mov r0, sp + bl fe_sq_op + mov r1, sp + add r0, sp, #32 + bl fe_sq_op + add r1, sp, #32 + add r0, sp, #32 + bl fe_sq_op + add r2, sp, #32 + ldr r1, [sp, #132] + add r0, sp, #32 + bl fe_mul_op + add r2, sp, #32 + mov r1, sp + mov r0, sp + bl fe_mul_op + mov r1, sp + add r0, sp, #0x40 + bl fe_sq_op + add r2, sp, #0x40 + add r1, sp, #32 + add r0, sp, #32 + bl fe_mul_op + add r1, sp, #32 + add r0, sp, #0x40 + bl fe_sq_op + mov r12, #4 +L_fe_invert1: + add r1, sp, #0x40 + add r0, sp, #0x40 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_fe_invert1 + add r2, sp, #32 + add r1, sp, #0x40 + add r0, sp, #32 + bl fe_mul_op + add r1, sp, #32 + add r0, sp, #0x40 + bl fe_sq_op + mov r12, #9 +L_fe_invert2: + add r1, sp, #0x40 + add r0, sp, #0x40 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_fe_invert2 + add r2, sp, #32 + add r1, sp, #0x40 + add r0, sp, #0x40 + bl fe_mul_op + add r1, sp, #0x40 + add r0, sp, #0x60 + bl fe_sq_op + mov r12, #19 +L_fe_invert3: + add r1, sp, #0x60 + add r0, sp, #0x60 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_fe_invert3 + add r2, sp, #0x40 + add r1, sp, #0x60 + add r0, sp, #0x40 + bl fe_mul_op + mov r12, #10 +L_fe_invert4: + add r1, sp, #0x40 + add r0, sp, #0x40 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_fe_invert4 + add r2, sp, #32 + add r1, sp, #0x40 + add r0, sp, #32 + bl fe_mul_op + add r1, sp, #32 + add r0, sp, #0x40 + bl fe_sq_op + mov r12, #49 +L_fe_invert5: + add r1, sp, #0x40 + add r0, sp, #0x40 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_fe_invert5 + add r2, sp, #32 + add r1, sp, #0x40 + add r0, sp, #0x40 + bl fe_mul_op + add r1, sp, #0x40 + add r0, sp, #0x60 + bl fe_sq_op + mov r12, #0x63 +L_fe_invert6: + add r1, sp, #0x60 + add r0, sp, #0x60 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_fe_invert6 + add r2, sp, #0x40 + add r1, sp, #0x60 + add r0, sp, #0x40 + bl fe_mul_op + mov r12, #50 +L_fe_invert7: + add r1, sp, #0x40 + add r0, sp, #0x40 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_fe_invert7 + add r2, sp, #32 + add r1, sp, #0x40 + add r0, sp, #32 + bl fe_mul_op + mov r12, #5 +L_fe_invert8: + add r1, sp, #32 + add r0, sp, #32 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_fe_invert8 + mov r2, sp + add r1, sp, #32 + ldr r0, [sp, #128] + bl fe_mul_op + ldr r1, [sp, #132] + ldr r0, [sp, #128] + add sp, sp, #0x88 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size fe_invert,.-fe_invert + .text + .align 4 + .globl fe_sq2 + .type fe_sq2, %function +fe_sq2: + push {lr} + sub sp, sp, #36 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] + str r0, [sp, #28] + str r1, [sp, #32] #else - strd r8, r9, 
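
The fe_invert routine above is a straight-line Fermat inversion: it raises the input to p - 2 for p = 2^255 - 19 using the standard curve25519 addition chain (the loop counts 4, 9, 19, 10, 49, 0x63, 50 and 5 are the squaring runs of that chain, some preceded by one explicit squaring). A minimal C sketch of the same chain, using hypothetical fe_sq()/fe_mul() helpers with the semantics of the fe_sq_op/fe_mul_op calls above:

/* Sketch only: same exponent chain as the assembly above.
 * Computes out = z^(p-2) mod p for p = 2^255 - 19. */
static void fe_invert_sketch(fe out, const fe z)
{
    fe t0, t1, t2, t3;
    int i;

    fe_sq(t0, z);                                   /* z^2            */
    fe_sq(t1, t0); fe_sq(t1, t1);                   /* z^8            */
    fe_mul(t1, z, t1);                              /* z^9            */
    fe_mul(t0, t0, t1);                             /* z^11           */
    fe_sq(t2, t0);                                  /* z^22           */
    fe_mul(t1, t1, t2);                             /* z^(2^5 - 1)    */
    fe_sq(t2, t1); for (i = 0; i < 4; i++) fe_sq(t2, t2);
    fe_mul(t1, t2, t1);                             /* z^(2^10 - 1)   */
    fe_sq(t2, t1); for (i = 0; i < 9; i++) fe_sq(t2, t2);
    fe_mul(t2, t2, t1);                             /* z^(2^20 - 1)   */
    fe_sq(t3, t2); for (i = 0; i < 19; i++) fe_sq(t3, t3);
    fe_mul(t2, t3, t2);                             /* z^(2^40 - 1)   */
    fe_sq(t2, t2); for (i = 0; i < 9; i++) fe_sq(t2, t2);
    fe_mul(t1, t2, t1);                             /* z^(2^50 - 1)   */
    fe_sq(t2, t1); for (i = 0; i < 49; i++) fe_sq(t2, t2);
    fe_mul(t2, t2, t1);                             /* z^(2^100 - 1)  */
    fe_sq(t3, t2); for (i = 0; i < 99; i++) fe_sq(t3, t3);
    fe_mul(t2, t3, t2);                             /* z^(2^200 - 1)  */
    fe_sq(t2, t2); for (i = 0; i < 49; i++) fe_sq(t2, t2);
    fe_mul(t1, t2, t1);                             /* z^(2^250 - 1)  */
    fe_sq(t1, t1); for (i = 0; i < 4; i++) fe_sq(t1, t1);
    fe_mul(out, t1, t0);                            /* z^(2^255 - 21) */
}

fe_pow22523 further below follows the same pattern to compute z^((p - 5)/8) = z^(2^252 - 3), which the point-decompression code uses when taking square roots.
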
[r0, #16] + strd r0, r1, [sp, #28] #endif - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 + ldm r1, {r0, r1, r2, r3, r4, r5, r6, r7} + # Square * 2 + umull r9, r10, r0, r0 + umull r11, r12, r0, r1 + adds r11, r11, r11 mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #16] - str r11, [r1, #20] -#else - strd r10, r11, [r1, #16] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2, #24] - ldr r5, [r2, #28] -#else - ldrd r4, r5, [r2, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #24] - ldr r7, [r1, #28] -#else - ldrd r6, r7, [r1, #24] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - adc r9, r5, r7 - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - sbc r11, r5, r7 - mov r12, #-19 - asr r3, r9, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Sub modulus (if overflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif - subs r4, r4, r12 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #8] - ldr r5, [r0, #12] -#else - ldrd r4, r5, [r0, #8] -#endif - sbcs r4, r4, r3 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #8] - str r5, [r0, #12] -#else - strd r4, r5, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #16] - ldr r5, [r0, #20] -#else - ldrd r4, r5, [r0, #16] -#endif - sbcs r4, r4, r3 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #16] - str r5, [r0, #20] -#else - strd r4, r5, [r0, #16] -#endif - sbcs r8, r8, r3 - sbc r9, r9, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #24] - str r9, [r0, #28] -#else - strd r8, r9, [r0, #24] -#endif - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Add modulus (if underflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif - adds r4, r4, r12 - adcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1] - str r5, [r1, #4] -#else - strd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #8] - ldr r5, [r1, #12] -#else - ldrd r4, r5, [r1, #8] -#endif - adcs r4, r4, r3 - adcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1, #8] - str r5, [r1, #12] -#else - strd r4, r5, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] -#else - ldrd r4, r5, [r1, #16] -#endif - adcs r4, r4, r3 - adcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1, #16] - str r5, [r1, #20] -#else - strd r4, r5, [r1, #16] -#endif - adcs r10, r10, r3 - adc r11, r11, lr + umaal r10, r11, lr, lr + stm sp, {r9, r10} + mov r8, lr + umaal r8, r12, r0, r2 + adcs r8, r8, r8 + umaal r8, r11, r1, r1 + umull r9, r10, r0, r3 + umaal r9, r12, r1, r2 + adcs r9, r9, r9 + umaal r9, r11, lr, lr #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #24] - str r11, [r1, #28] + str r8, [sp, #8] + str r9, [sp, #12] #else - strd 
r10, r11, [r1, #24] + strd r8, r9, [sp, #8] #endif - add sp, sp, #0x60 + mov r9, lr + umaal r9, r10, r0, r4 + umaal r9, r12, r1, r3 + adcs r9, r9, r9 + umaal r9, r11, r2, r2 + str r9, [sp, #16] + umull r9, r8, r0, r5 + umaal r9, r12, r1, r4 + umaal r9, r10, r2, r3 + adcs r9, r9, r9 + umaal r9, r11, lr, lr + str r9, [sp, #20] + mov r9, lr + umaal r9, r8, r0, r6 + umaal r9, r12, r1, r5 + umaal r9, r10, r2, r4 + adcs r9, r9, r9 + umaal r9, r11, r3, r3 + str r9, [sp, #24] + umull r0, r9, r0, r7 + umaal r0, r8, r1, r6 + umaal r0, r12, r2, r5 + umaal r0, r10, r3, r4 + adcs r0, r0, r0 + umaal r0, r11, lr, lr + # R[7] = r0 + umaal r9, r8, r1, r7 + umaal r9, r10, r2, r6 + umaal r12, r9, r3, r5 + adcs r12, r12, r12 + umaal r12, r11, r4, r4 + # R[8] = r12 + umaal r9, r8, r2, r7 + umaal r10, r9, r3, r6 + mov r2, lr + umaal r10, r2, r4, r5 + adcs r10, r10, r10 + umaal r11, r10, lr, lr + # R[9] = r11 + umaal r2, r8, r3, r7 + umaal r2, r9, r4, r6 + adcs r3, r2, r2 + umaal r10, r3, r5, r5 + # R[10] = r10 + mov r1, lr + umaal r1, r8, r4, r7 + umaal r1, r9, r5, r6 + adcs r4, r1, r1 + umaal r3, r4, lr, lr + # R[11] = r3 + umaal r8, r9, r5, r7 + adcs r8, r8, r8 + umaal r4, r8, r6, r6 + # R[12] = r4 + mov r5, lr + umaal r5, r9, r6, r7 + adcs r5, r5, r5 + umaal r8, r5, lr, lr + # R[13] = r8 + adcs r9, r9, r9 + umaal r9, r5, r7, r7 + adcs r7, r5, lr + # R[14] = r9 + # R[15] = r7 + # Reduce + mov r6, #37 + umaal r7, r0, r7, r6 + mov r6, #19 + lsl r0, r0, #1 + orr r0, r0, r7, lsr #31 + mul lr, r0, r6 + pop {r0, r1} + mov r6, #38 + umaal r0, lr, r12, r6 + umaal r1, lr, r11, r6 + mov r12, r3 + mov r11, r4 + pop {r2, r3, r4} + umaal r2, lr, r10, r6 + umaal r3, lr, r12, r6 + umaal r4, lr, r11, r6 + mov r12, r6 + pop {r5, r6} + umaal r5, lr, r8, r12 + bfc r7, #31, #1 + umaal r6, lr, r9, r12 + add r7, r7, lr + # Reduce if top bit set + mov r11, #19 + and r12, r11, r7, ASR #31 + adds r0, r0, r12 + adcs r1, r1, #0 + adcs r2, r2, #0 + adcs r3, r3, #0 + adcs r4, r4, #0 + adcs r5, r5, #0 + bfc r7, #31, #1 + adcs r6, r6, #0 + adc r7, r7, #0 + # Double + adds r0, r0, r0 + adcs r1, r1, r1 + adcs r2, r2, r2 + adcs r3, r3, r3 + adcs r4, r4, r4 + adcs r5, r5, r5 + adcs r6, r6, r6 + adc r7, r7, r7 + # Reduce if top bit set + mov r11, #19 + and r12, r11, r7, ASR #31 + adds r0, r0, r12 + adcs r1, r1, #0 + adcs r2, r2, #0 + adcs r3, r3, #0 + adcs r4, r4, #0 + adcs r5, r5, #0 + bfc r7, #31, #1 + adcs r6, r6, #0 + adc r7, r7, #0 + pop {r12, lr} + # Store + stm r12, {r0, r1, r2, r3, r4, r5, r6, r7} + mov r0, r12 + mov r1, lr + pop {pc} + .size fe_sq2,.-fe_sq2 + .text + .align 4 + .globl fe_pow22523 + .type fe_pow22523, %function +fe_pow22523: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #0x68 + # pow22523 + str r0, [sp, #96] + str r1, [sp, #100] + ldr r1, [sp, #100] + mov r0, sp + bl fe_sq_op + mov r1, sp + add r0, sp, #32 + bl fe_sq_op + add r1, sp, #32 + add r0, sp, #32 + bl fe_sq_op + add r2, sp, #32 + ldr r1, [sp, #100] + add r0, sp, #32 + bl fe_mul_op + add r2, sp, #32 + mov r1, sp + mov r0, sp + bl fe_mul_op + mov r1, sp + mov r0, sp + bl fe_sq_op + mov r2, sp + add r1, sp, #32 + mov r0, sp + bl fe_mul_op + mov r1, sp + add r0, sp, #32 + bl fe_sq_op + mov r12, #4 +L_fe_pow22523_1: + add r1, sp, #32 + add r0, sp, #32 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_fe_pow22523_1 + mov r2, sp + add r1, sp, #32 + mov r0, sp + bl fe_mul_op + mov r1, sp + add r0, sp, #32 + bl fe_sq_op + mov r12, #9 +L_fe_pow22523_2: + add r1, sp, #32 + add r0, sp, #32 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, 
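
The multiply/square helpers used above reduce their 512-bit intermediate modulo p = 2^255 - 19 by folding rather than dividing: 2^256 = 38 (mod p), so the high eight words can be multiplied by 38 and added back into the low eight, and any bits at or above 2^255 are then folded with 19. fe_sq2 performs this reduction, doubles the result, and re-folds the top bit with 19 afterwards, which is where its 19/38 immediates and the bfc of bit 31 come from. A hypothetical word-by-word sketch of the basic fold:

#include <stdint.h>

/* Illustration only: fold a 16-word (512-bit) value t modulo p = 2^255 - 19.
 * The result is below 2*p; one conditional subtraction of p (not shown)
 * would make it canonical. */
static void fold_mod_p(uint32_t r[8], const uint32_t t[16])
{
    uint64_t c = 0;
    uint32_t top;
    int i;

    for (i = 0; i < 8; i++) {                   /* r = low + 38 * high */
        c += (uint64_t)t[i] + (uint64_t)38 * t[i + 8];
        r[i] = (uint32_t)c;
        c >>= 32;
    }
    top = (uint32_t)((c << 1) | (r[7] >> 31));  /* everything >= 2^255 */
    r[7] &= 0x7fffffff;
    c = (uint64_t)19 * top;                     /* 2^255 = 19 (mod p)  */
    for (i = 0; i < 8; i++) {
        c += r[i];
        r[i] = (uint32_t)c;
        c >>= 32;
    }
}
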
r12, #1 + bne L_fe_pow22523_2 + mov r2, sp + add r1, sp, #32 + add r0, sp, #32 + bl fe_mul_op + add r1, sp, #32 + add r0, sp, #0x40 + bl fe_sq_op + mov r12, #19 +L_fe_pow22523_3: + add r1, sp, #0x40 + add r0, sp, #0x40 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_fe_pow22523_3 + add r2, sp, #32 + add r1, sp, #0x40 + add r0, sp, #32 + bl fe_mul_op + mov r12, #10 +L_fe_pow22523_4: + add r1, sp, #32 + add r0, sp, #32 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_fe_pow22523_4 + mov r2, sp + add r1, sp, #32 + mov r0, sp + bl fe_mul_op + mov r1, sp + add r0, sp, #32 + bl fe_sq_op + mov r12, #49 +L_fe_pow22523_5: + add r1, sp, #32 + add r0, sp, #32 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_fe_pow22523_5 + mov r2, sp + add r1, sp, #32 + add r0, sp, #32 + bl fe_mul_op + add r1, sp, #32 + add r0, sp, #0x40 + bl fe_sq_op + mov r12, #0x63 +L_fe_pow22523_6: + add r1, sp, #0x40 + add r0, sp, #0x40 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_fe_pow22523_6 + add r2, sp, #32 + add r1, sp, #0x40 + add r0, sp, #32 + bl fe_mul_op + mov r12, #50 +L_fe_pow22523_7: + add r1, sp, #32 + add r0, sp, #32 + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_fe_pow22523_7 + mov r2, sp + add r1, sp, #32 + mov r0, sp + bl fe_mul_op + mov r12, #2 +L_fe_pow22523_8: + mov r1, sp + mov r0, sp + push {r12} + bl fe_sq_op + pop {r12} + subs r12, r12, #1 + bne L_fe_pow22523_8 + ldr r2, [sp, #100] + mov r1, sp + ldr r0, [sp, #96] + bl fe_mul_op + ldr r1, [sp, #100] + ldr r0, [sp, #96] + add sp, sp, #0x68 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} - .size fe_ge_add,.-fe_ge_add + .size fe_pow22523,.-fe_pow22523 .text .align 4 - .globl fe_ge_sub - .type fe_ge_sub, %function -fe_ge_sub: + .globl ge_p1p1_to_p2 + .type ge_p1p1_to_p2, %function +ge_p1p1_to_p2: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} - sub sp, sp, #0x60 + sub sp, sp, #8 str r0, [sp] str r1, [sp, #4] - str r2, [sp, #8] - str r3, [sp, #12] + add r2, r1, #0x60 + bl fe_mul_op ldr r0, [sp] - ldr r1, [sp, #136] - ldr r2, [sp, #132] - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #8] - ldr r7, [r1, #12] -#else - ldrd r6, r7, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2] - ldr r9, [r2, #4] -#else - ldrd r8, r9, [r2] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #8] - ldr r11, [r2, #12] -#else - ldrd r10, r11, [r2, #8] -#endif - adds r8, r4, r8 - adcs r9, r5, r9 - adcs r10, r6, r10 - adcs r11, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] -#else - strd r8, r9, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #8] - str r11, [r0, #12] -#else - strd r10, r11, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] -#else - ldrd r4, r5, [r1, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #24] - ldr r7, [r1, #28] -#else - ldrd r6, r7, [r1, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2, #16] - ldr r9, [r2, #20] -#else - ldrd r8, r9, [r2, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #24] - ldr r11, [r2, #28] -#else - ldrd r10, 
r11, [r2, #24] -#endif - adcs r8, r4, r8 - adcs r9, r5, r9 - adcs r10, r6, r10 - adc r11, r7, r11 - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Sub modulus (if overflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #8] - ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif - subs r4, r4, r12 - sbcs r5, r5, r3 - sbcs r6, r6, r3 - sbcs r7, r7, r3 - sbcs r8, r8, r3 - sbcs r9, r9, r3 - sbcs r10, r10, r3 - sbc r11, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] -#else - strd r6, r7, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #24] - str r11, [r0, #28] -#else - strd r10, r11, [r0, #24] -#endif - ldr r0, [sp, #4] - ldr r1, [sp, #136] - ldr r2, [sp, #132] - # Sub -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #8] - ldr r7, [r1, #12] -#else - ldrd r6, r7, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2] - ldr r9, [r2, #4] -#else - ldrd r8, r9, [r2] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #8] - ldr r11, [r2, #12] -#else - ldrd r10, r11, [r2, #8] -#endif - subs r8, r4, r8 - sbcs r9, r5, r9 - sbcs r10, r6, r10 - sbcs r11, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] -#else - strd r8, r9, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #8] - str r11, [r0, #12] -#else - strd r10, r11, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] -#else - ldrd r4, r5, [r1, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #24] - ldr r7, [r1, #28] -#else - ldrd r6, r7, [r1, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r2, #16] - ldr r9, [r2, #20] -#else - ldrd r8, r9, [r2, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r2, #24] - ldr r11, [r2, #28] -#else - ldrd r10, r11, [r2, #24] -#endif - sbcs r8, r4, r8 - sbcs r9, r5, r9 - sbcs r10, r6, r10 - sbc r11, r7, r11 - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Add modulus (if underflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #8] - ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif - adds r4, r4, r12 - adcs r5, r5, r3 - adcs r6, r6, r3 - adcs r7, r7, r3 - adcs r8, r8, r3 - adcs r9, r9, r3 - adcs r10, r10, r3 - adc r11, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) 
&& (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] -#else - strd r6, r7, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #24] - str r11, [r0, #28] -#else - strd r10, r11, [r0, #24] -#endif - ldr r2, [sp, #160] - ldr r1, [sp] - ldr r0, [sp, #8] - bl fe_mul - ldr r2, [sp, #156] ldr r1, [sp, #4] - ldr r0, [sp, #4] - bl fe_mul - ldr r2, [sp, #144] - ldr r1, [sp, #152] - ldr r0, [sp, #12] - bl fe_mul - ldr r2, [sp, #148] - ldr r1, [sp, #140] + add r2, r1, #0x40 + add r1, r1, #32 + add r0, r0, #32 + bl fe_mul_op + ldr r0, [sp] + ldr r1, [sp, #4] + add r2, r1, #0x60 + add r1, r1, #0x40 + add r0, r0, #0x40 + bl fe_mul_op + add sp, sp, #8 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size ge_p1p1_to_p2,.-ge_p1p1_to_p2 + .text + .align 4 + .globl ge_p1p1_to_p3 + .type ge_p1p1_to_p3, %function +ge_p1p1_to_p3: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #8 + str r0, [sp] + str r1, [sp, #4] + add r2, r1, #0x60 + bl fe_mul_op + ldr r0, [sp] + ldr r1, [sp, #4] + add r2, r1, #0x40 + add r1, r1, #32 + add r0, r0, #32 + bl fe_mul_op + ldr r0, [sp] + ldr r1, [sp, #4] + add r2, r1, #0x60 + add r1, r1, #0x40 + add r0, r0, #0x40 + bl fe_mul_op + ldr r0, [sp] + ldr r1, [sp, #4] + add r2, r1, #32 + add r0, r0, #0x60 + bl fe_mul_op + add sp, sp, #8 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size ge_p1p1_to_p3,.-ge_p1p1_to_p3 + .text + .align 4 + .globl ge_p2_dbl + .type ge_p2_dbl, %function +ge_p2_dbl: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #8 + str r0, [sp] + str r1, [sp, #4] + bl fe_sq_op + ldr r0, [sp] + ldr r1, [sp, #4] + add r1, r1, #32 + add r0, r0, #0x40 + bl fe_sq_op + ldr r0, [sp] + ldr r1, [sp, #4] + add r2, r1, #32 + add r0, r0, #32 + bl fe_add_op + mov r1, r0 + add r0, r0, #0x40 + bl fe_sq_op + ldr r0, [sp] + mov r3, r0 + add r2, r0, #0x40 + add r1, r0, #0x40 + add r0, r0, #32 + bl fe_add_sub_op + mov r2, r0 + add r1, r0, #0x40 + sub r0, r0, #32 + bl fe_sub_op + ldr r1, [sp, #4] + add r1, r1, #0x40 + add r0, r0, #0x60 + bl fe_sq2 + sub r2, r0, #32 + mov r1, r0 + bl fe_sub_op + add sp, sp, #8 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size ge_p2_dbl,.-ge_p2_dbl + .text + .align 4 + .globl ge_madd + .type ge_madd, %function +ge_madd: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #12 + str r0, [sp] + str r1, [sp, #4] + str r2, [sp, #8] + mov r2, r1 + add r1, r1, #32 + bl fe_add_op + ldr r1, [sp, #4] + mov r2, r1 + add r1, r1, #32 + add r0, r0, #32 + bl fe_sub_op + ldr r2, [sp, #8] + sub r1, r0, #32 + add r0, r0, #32 + bl fe_mul_op ldr r0, [sp] - bl fe_mul - add r0, sp, #16 - ldr r1, [sp] + ldr r2, [sp, #8] + add r2, r2, #32 + add r1, r0, #32 + add r0, r0, #32 + bl fe_mul_op + ldr r0, [sp] + ldr r1, [sp, #8] + ldr r2, [sp, #4] + add r2, r2, #0x60 + add r1, r1, #0x40 + add r0, r0, #0x60 + bl fe_mul_op + ldr r0, [sp] + add r3, r0, #32 + add r2, r0, #0x40 + mov r1, r0 + add r0, r0, #32 + bl fe_add_sub_op + ldr r1, [sp, #4] + add r1, r1, #0x40 + add r0, r0, #32 # Double -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r1, #8] - ldr r7, [r1, #12] -#else - ldrd r6, r7, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r8, [r1, #16] - ldr r9, [r1, #20] 
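
ge_p1p1_to_p2 and ge_p1p1_to_p3 above simply multiply the completed (P1xP1) representation out into projective / extended coordinates; the offsets 32, 0x40 and 0x60 select the Y, Z and T fields (each field a 32-byte fe). In ref10-style C (field and type names as used by the portable fallback; treat the exact types as an assumption here) the two routines correspond to:

void ge_p1p1_to_p2_sketch(ge_p2* r, const ge_p1p1* p)
{
    fe_mul(r->X, p->X, p->T);
    fe_mul(r->Y, p->Y, p->Z);
    fe_mul(r->Z, p->Z, p->T);
}

void ge_p1p1_to_p3_sketch(ge_p3* r, const ge_p1p1* p)
{
    fe_mul(r->X, p->X, p->T);
    fe_mul(r->Y, p->Y, p->Z);
    fe_mul(r->Z, p->Z, p->T);
    fe_mul(r->T, p->X, p->Y);   /* the one extra multiply in _to_p3 */
}

ge_p2_dbl above is the matching extended-coordinates doubling, built from fe_sq_op, fe_add_op/fe_sub_op, fe_add_sub_op and one fe_sq2 call on Z.
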
-#else - ldrd r8, r9, [r1, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r10, [r1, #24] - ldr r11, [r1, #28] -#else - ldrd r10, r11, [r1, #24] -#endif + ldm r1, {r4, r5, r6, r7, r8, r9, r10, r11} adds r4, r4, r4 adcs r5, r5, r5 adcs r6, r6, r6 @@ -9447,537 +4307,1214 @@ fe_ge_sub: adcs r8, r8, r8 adcs r9, r9, r9 adcs r10, r10, r10 - adc r11, r11, r11 - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Sub modulus (if overflow) - subs r4, r4, r12 - sbcs r5, r5, r3 - sbcs r6, r6, r3 - sbcs r7, r7, r3 - sbcs r8, r8, r3 - sbcs r9, r9, r3 - sbcs r10, r10, r3 - sbc r11, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r6, [r0, #8] - str r7, [r0, #12] -#else - strd r6, r7, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r0, #24] - str r11, [r0, #28] -#else - strd r10, r11, [r0, #24] -#endif - ldr r0, [sp, #4] - ldr r1, [sp] - ldr r2, [sp, #8] - # Add-Sub - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2] - ldr r5, [r2, #4] -#else - ldrd r4, r5, [r2] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0] - ldr r7, [r0, #4] -#else - ldrd r6, r7, [r0] -#endif - adds r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] -#else - strd r8, r9, [r0] -#endif - # Sub - subs r10, r4, r6 mov lr, #0 - sbcs r11, r5, r7 + adcs r11, r11, r11 adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1] - str r11, [r1, #4] -#else - strd r10, r11, [r1] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2, #8] - ldr r5, [r2, #12] -#else - ldrd r4, r5, [r2, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #8] - ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #8] - str r9, [r0, #12] -#else - strd r8, r9, [r0, #8] -#endif - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 + mov r12, #19 + lsl lr, lr, #1 + orr lr, lr, r11, lsr #31 + mul r12, lr, r12 + adds r4, r4, r12 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adcs r9, r9, #0 + bfc r11, #31, #1 + adcs r10, r10, #0 + adc r11, r11, #0 + stm r0, {r4, r5, r6, r7, r8, r9, r10, r11} + # Done Double + add r3, r0, #32 + add r1, r0, #32 + bl fe_add_sub_op + add sp, sp, #12 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size ge_madd,.-ge_madd + .text + .align 4 + .globl ge_msub + .type ge_msub, %function +ge_msub: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #12 + str r0, [sp] + str r1, [sp, #4] + str r2, [sp, #8] + mov r2, r1 + add r1, r1, #32 + bl fe_add_op + ldr r1, [sp, #4] + mov r2, r1 + add r1, r1, #32 + add r0, r0, #32 + bl fe_sub_op + ldr r2, [sp, #8] + add r2, r2, #32 + sub r1, r0, #32 + add r0, r0, #32 + bl fe_mul_op + ldr r0, [sp] + ldr r2, [sp, #8] + add r1, r0, #32 + add r0, r0, #32 + bl fe_mul_op + ldr r0, [sp] + ldr r1, [sp, #8] + ldr r2, 
[sp, #4] + add r2, r2, #0x60 + add r1, r1, #0x40 + add r0, r0, #0x60 + bl fe_mul_op + ldr r0, [sp] + add r3, r0, #32 + add r2, r0, #0x40 + mov r1, r0 + add r0, r0, #32 + bl fe_add_sub_op + ldr r1, [sp, #4] + add r1, r1, #0x40 + add r0, r0, #32 + # Double + ldm r1, {r4, r5, r6, r7, r8, r9, r10, r11} + adds r4, r4, r4 + adcs r5, r5, r5 + adcs r6, r6, r6 + adcs r7, r7, r7 + adcs r8, r8, r8 + adcs r9, r9, r9 + adcs r10, r10, r10 mov lr, #0 - sbcs r11, r5, r7 + adcs r11, r11, r11 adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #8] - str r11, [r1, #12] -#else - strd r10, r11, [r1, #8] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2, #16] - ldr r5, [r2, #20] -#else - ldrd r4, r5, [r2, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #16] - ldr r7, [r0, #20] -#else - ldrd r6, r7, [r0, #16] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 + mov r12, #19 + lsl lr, lr, #1 + orr lr, lr, r11, lsr #31 + mul r12, lr, r12 + adds r4, r4, r12 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adcs r9, r9, #0 + bfc r11, #31, #1 + adcs r10, r10, #0 + adc r11, r11, #0 + stm r0, {r4, r5, r6, r7, r8, r9, r10, r11} + # Done Double + add r3, r0, #32 + mov r1, r0 + add r0, r0, #32 + bl fe_add_sub_op + add sp, sp, #12 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size ge_msub,.-ge_msub + .text + .align 4 + .globl ge_add + .type ge_add, %function +ge_add: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #44 + str r0, [sp] + str r1, [sp, #4] + str r2, [sp, #8] + mov r3, r1 + add r2, r1, #32 + add r1, r0, #32 + bl fe_add_sub_op + ldr r2, [sp, #8] + mov r1, r0 + add r0, r0, #0x40 + bl fe_mul_op + ldr r0, [sp] + ldr r2, [sp, #8] + add r2, r2, #32 + add r1, r0, #32 + add r0, r0, #32 + bl fe_mul_op + ldr r0, [sp] + ldr r1, [sp, #8] + ldr r2, [sp, #4] + add r2, r2, #0x60 + add r1, r1, #0x60 + add r0, r0, #0x60 + bl fe_mul_op + ldr r0, [sp] + ldr r1, [sp, #4] + ldr r2, [sp, #8] + add r2, r2, #0x40 + add r1, r1, #0x40 + bl fe_mul_op + ldr r1, [sp] + add r0, sp, #12 + # Double + ldm r1, {r4, r5, r6, r7, r8, r9, r10, r11} + adds r4, r4, r4 + adcs r5, r5, r5 + adcs r6, r6, r6 + adcs r7, r7, r7 + adcs r8, r8, r8 + adcs r9, r9, r9 + adcs r10, r10, r10 mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #16] - str r11, [r1, #20] -#else - strd r10, r11, [r1, #16] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2, #24] - ldr r5, [r2, #28] -#else - ldrd r4, r5, [r2, #24] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #24] - ldr r7, [r0, #28] -#else - ldrd r6, r7, [r0, #24] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 - adc r9, r5, r7 - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - sbc r11, r5, r7 - mov r12, #-19 - asr r3, r9, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Sub modulus (if overflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] -#else - ldrd r4, r5, [r0] -#endif - subs r4, r4, r12 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, 
#4] -#else - strd r4, r5, [r0] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #8] - ldr r5, [r0, #12] -#else - ldrd r4, r5, [r0, #8] -#endif - sbcs r4, r4, r3 - sbcs r5, r5, r3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #8] - str r5, [r0, #12] -#else - strd r4, r5, [r0, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #16] - ldr r5, [r0, #20] + adcs r11, r11, r11 + adc lr, lr, #0 + mov r12, #19 + lsl lr, lr, #1 + orr lr, lr, r11, lsr #31 + mul r12, lr, r12 + adds r4, r4, r12 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adcs r9, r9, #0 + bfc r11, #31, #1 + adcs r10, r10, #0 + adc r11, r11, #0 + stm r0, {r4, r5, r6, r7, r8, r9, r10, r11} + # Done Double + add r3, r1, #32 + add r2, r1, #0x40 + add r0, r1, #32 + bl fe_add_sub_op + add r3, r0, #0x40 + add r2, sp, #12 + add r1, r0, #0x40 + add r0, r0, #32 + bl fe_add_sub_op + add sp, sp, #44 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size ge_add,.-ge_add + .text + .align 4 + .globl ge_sub + .type ge_sub, %function +ge_sub: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #44 + str r0, [sp] + str r1, [sp, #4] + str r2, [sp, #8] + mov r3, r1 + add r2, r1, #32 + add r1, r0, #32 + bl fe_add_sub_op + ldr r2, [sp, #8] + add r2, r2, #32 + mov r1, r0 + add r0, r0, #0x40 + bl fe_mul_op + ldr r0, [sp] + ldr r2, [sp, #8] + add r1, r0, #32 + add r0, r0, #32 + bl fe_mul_op + ldr r0, [sp] + ldr r1, [sp, #8] + ldr r2, [sp, #4] + add r2, r2, #0x60 + add r1, r1, #0x60 + add r0, r0, #0x60 + bl fe_mul_op + ldr r0, [sp] + ldr r1, [sp, #4] + ldr r2, [sp, #8] + add r2, r2, #0x40 + add r1, r1, #0x40 + bl fe_mul_op + ldr r1, [sp] + add r0, sp, #12 + # Double + ldm r1, {r4, r5, r6, r7, r8, r9, r10, r11} + adds r4, r4, r4 + adcs r5, r5, r5 + adcs r6, r6, r6 + adcs r7, r7, r7 + adcs r8, r8, r8 + adcs r9, r9, r9 + adcs r10, r10, r10 + mov lr, #0 + adcs r11, r11, r11 + adc lr, lr, #0 + mov r12, #19 + lsl lr, lr, #1 + orr lr, lr, r11, lsr #31 + mul r12, lr, r12 + adds r4, r4, r12 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adcs r9, r9, #0 + bfc r11, #31, #1 + adcs r10, r10, #0 + adc r11, r11, #0 + stm r0, {r4, r5, r6, r7, r8, r9, r10, r11} + # Done Double + add r3, r1, #32 + add r2, r1, #0x40 + add r0, r1, #32 + bl fe_add_sub_op + add r3, r0, #0x40 + add r2, sp, #12 + add r1, r0, #32 + add r0, r0, #0x40 + bl fe_add_sub_op + add sp, sp, #44 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size ge_sub,.-ge_sub + .text + .align 4 + .globl sc_reduce + .type sc_reduce, %function +sc_reduce: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #52 + # Load bits 252-511 + add r0, r0, #28 + ldm r0, {r1, r2, r3, r4, r5, r6, r7, r8, r9} + lsr lr, r9, #24 + lsl r9, r9, #4 + orr r9, r9, r8, lsr #28 + lsl r8, r8, #4 + orr r8, r8, r7, lsr #28 + lsl r7, r7, #4 + orr r7, r7, r6, lsr #28 + lsl r6, r6, #4 + orr r6, r6, r5, lsr #28 + lsl r5, r5, #4 + orr r5, r5, r4, lsr #28 + lsl r4, r4, #4 + orr r4, r4, r3, lsr #28 + lsl r3, r3, #4 + orr r3, r3, r2, lsr #28 + lsl r2, r2, #4 + orr r2, r2, r1, lsr #28 + bfc r9, #28, #4 + sub r0, r0, #28 + # Add order times bits 504..511 +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + mov r10, #0x2c + lsl r10, r10, #8 + add r10, r10, #0x13 +#else + mov r10, #0x2c13 +#endif + movt r10, #0xa30a +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + mov r11, #0x9c + lsl r11, r11, #8 + add r11, r11, #0xe5 #else - ldrd r4, r5, [r0, #16] + mov r11, 
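
The ge_madd/ge_msub/ge_add/ge_sub routines above all lean on the new fe_add_sub_op helper (its inline-assembly definition appears later in this patch), which produces a sum and a difference modulo p in one pass so each operand is loaded only once; from the call sites the convention appears to be r0 = a+b output, r1 = a-b output, r2/r3 = inputs. A portable sketch of the operation, assuming fully reduced inputs and hypothetical names (the real helper also copes with not-fully-reduced inputs by folding the overflow/underflow with 19):

#include <stdint.h>

static void fe_add_sub_sketch(uint32_t out_add[8], uint32_t out_sub[8],
                              const uint32_t a[8], const uint32_t b[8])
{
    static const uint32_t p[8] = {          /* 2^255 - 19, little-endian */
        0xffffffed, 0xffffffff, 0xffffffff, 0xffffffff,
        0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff
    };
    uint64_t c = 0;
    int64_t  bw = 0;
    int i;

    for (i = 0; i < 8; i++) {               /* one pass over the limbs   */
        int64_t d = (int64_t)a[i] - (int64_t)b[i] + bw;
        c += (uint64_t)a[i] + b[i];
        out_add[i] = (uint32_t)c;  c >>= 32;
        out_sub[i] = (uint32_t)d;  bw = (d < 0) ? -1 : 0;
    }
    if (out_add[7] >> 31) {                 /* a+b >= 2^255: fold with 19 */
        out_add[7] &= 0x7fffffff;
        c = 19;
        for (i = 0; i < 8; i++) {
            c += out_add[i]; out_add[i] = (uint32_t)c; c >>= 32;
        }
    }
    if (bw != 0) {                          /* a < b: add p back once     */
        c = 0;
        for (i = 0; i < 8; i++) {
            c += (uint64_t)out_sub[i] + p[i]; out_sub[i] = (uint32_t)c; c >>= 32;
        }
    }
}
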
#0x9ce5 #endif - sbcs r4, r4, r3 - sbcs r5, r5, r3 + movt r11, #0xa7ed + mov r1, #0 + umlal r2, r1, r10, lr + umaal r3, r1, r11, lr #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #16] - str r5, [r0, #20] + mov r10, #0x63 + lsl r10, r10, #8 + add r10, r10, #0x29 #else - strd r4, r5, [r0, #16] + mov r10, #0x6329 #endif - sbcs r8, r8, r3 - sbc r9, r9, lr + movt r10, #0x5d08 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #24] - str r9, [r0, #28] + mov r11, #0x6 + lsl r11, r11, #8 + add r11, r11, #0x21 #else - strd r8, r9, [r0, #24] + mov r11, #0x621 #endif - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Add modulus (if underflow) + movt r11, #0xeb21 + umaal r4, r1, r10, lr + umaal r5, r1, r11, lr + adds r6, r6, r1 + adcs r7, r7, #0 + adcs r8, r8, #0 + adc r9, r9, #0 + subs r6, r6, lr + sbcs r7, r7, #0 + sbcs r8, r8, #0 + sbc r9, r9, #0 + # Sub product of top 8 words and order #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] + mov r1, #0x2c + lsl r1, r1, #8 + add r1, r1, #0x13 #else - ldrd r4, r5, [r1] + mov r1, #0x2c13 #endif - adds r4, r4, r12 - adcs r5, r5, r3 + movt r1, #0xa30a + mov lr, #0 + ldm r0!, {r10, r11, r12} + umlal r10, lr, r2, r1 + umaal r11, lr, r3, r1 + umaal r12, lr, r4, r1 + stm sp!, {r10, r11, r12} + ldm r0!, {r10, r11, r12} + umaal r10, lr, r5, r1 + umaal r11, lr, r6, r1 + umaal r12, lr, r7, r1 + stm sp!, {r10, r11, r12} + ldm r0!, {r10, r11} + umaal r10, lr, r8, r1 + bfc r11, #28, #4 + umaal r11, lr, r9, r1 + stm sp!, {r10, r11, lr} + sub r0, r0, #16 + sub sp, sp, #32 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1] - str r5, [r1, #4] + mov r1, #0x9c + lsl r1, r1, #8 + add r1, r1, #0xe5 #else - strd r4, r5, [r1] + mov r1, #0x9ce5 #endif + movt r1, #0xa7ed + mov lr, #0 + ldm sp, {r10, r11, r12} + umlal r10, lr, r2, r1 + umaal r11, lr, r3, r1 + umaal r12, lr, r4, r1 + stm sp!, {r10, r11, r12} + ldm sp, {r10, r11, r12} + umaal r10, lr, r5, r1 + umaal r11, lr, r6, r1 + umaal r12, lr, r7, r1 + stm sp!, {r10, r11, r12} + ldm sp, {r10, r11} + umaal r10, lr, r8, r1 + umaal r11, lr, r9, r1 + stm sp!, {r10, r11, lr} + sub sp, sp, #32 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #8] - ldr r5, [r1, #12] + mov r1, #0x63 + lsl r1, r1, #8 + add r1, r1, #0x29 #else - ldrd r4, r5, [r1, #8] + mov r1, #0x6329 #endif - adcs r4, r4, r3 - adcs r5, r5, r3 + movt r1, #0x5d08 + mov lr, #0 + ldm sp, {r10, r11, r12} + umlal r10, lr, r2, r1 + umaal r11, lr, r3, r1 + umaal r12, lr, r4, r1 + stm sp!, {r10, r11, r12} + ldm sp, {r10, r11, r12} + umaal r10, lr, r5, r1 + umaal r11, lr, r6, r1 + umaal r12, lr, r7, r1 + stm sp!, {r10, r11, r12} + ldm sp, {r10, r11} + umaal r10, lr, r8, r1 + umaal r11, lr, r9, r1 + stm sp!, {r10, r11, lr} + sub sp, sp, #32 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1, #8] - str r5, [r1, #12] + mov r1, #0x6 + lsl r1, r1, #8 + add r1, r1, #0x21 #else - strd r4, r5, [r1, #8] + mov r1, #0x621 #endif + movt r1, #0xeb21 + mov lr, #0 + ldm sp, {r10, r11, r12} + umlal r10, lr, r2, r1 + umaal r11, lr, r3, r1 + umaal r12, lr, r4, r1 + stm sp!, {r10, r11, r12} + ldm sp, {r10, r11, r12} + umaal r10, lr, r5, r1 + umaal r11, lr, r6, r1 + umaal r12, lr, r7, r1 + stm sp!, {r10, r11, r12} + ldm sp, {r10, r11} + umaal r10, lr, r8, r1 + umaal r11, lr, r9, r1 + stm sp!, {r10, r11, lr} + sub sp, sp, #32 + # Subtract at 4 * 32 + ldm sp, {r10, r11, 
r12} + subs r10, r10, r2 + sbcs r11, r11, r3 + sbcs r12, r12, r4 + stm sp!, {r10, r11, r12} + ldm sp, {r10, r11, r12} + sbcs r10, r10, r5 + sbcs r11, r11, r6 + sbcs r12, r12, r7 + stm sp!, {r10, r11, r12} + ldm sp, {r10, r11} + sbcs r10, r10, r8 + sbc r11, r11, r9 + stm sp!, {r10, r11} + sub sp, sp, #36 + asr lr, r11, #25 + # Conditionally subtract order starting at bit 125 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] + mov r1, #0xa00000 + lsl r1, r1, #8 + add r1, r1, #0x0 #else - ldrd r4, r5, [r1, #16] + mov r1, #0xa0000000 #endif - adcs r4, r4, r3 - adcs r5, r5, r3 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1, #16] - str r5, [r1, #20] + mov r2, #0xba + lsl r2, r2, #8 + add r2, r2, #0x7d #else - strd r4, r5, [r1, #16] + mov r2, #0xba7d #endif - adcs r10, r10, r3 - adc r11, r11, lr + movt r2, #0x4b9e #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #24] - str r11, [r1, #28] + mov r3, #0x4c + lsl r3, r3, #8 + add r3, r3, #0x63 #else - strd r10, r11, [r1, #24] + mov r3, #0x4c63 #endif - ldr r0, [sp, #12] - ldr r1, [sp, #8] - add r2, sp, #16 - # Add-Sub - # Add + movt r3, #0xcb02 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2] - ldr r5, [r2, #4] + mov r4, #0xf3 + lsl r4, r4, #8 + add r4, r4, #0x9a #else - ldrd r4, r5, [r2] + mov r4, #0xf39a #endif + movt r4, #0xd45e #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0] - ldr r7, [r0, #4] + mov r5, #0xdf + lsl r5, r5, #8 + add r5, r5, #0x3b #else - ldrd r6, r7, [r0] + mov r5, #0xdf3b #endif - adds r8, r4, r6 - mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 + movt r5, #0x29b #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0] - str r9, [r0, #4] + mov r9, #0x20000 + lsl r9, r9, #8 + add r9, r9, #0x0 #else - strd r8, r9, [r0] + mov r9, #0x2000000 #endif - # Sub - subs r10, r4, r6 + and r1, r1, lr + and r2, r2, lr + and r3, r3, lr + and r4, r4, lr + and r5, r5, lr + and r9, r9, lr + ldm sp, {r10, r11, r12} + adds r10, r10, r1 + adcs r11, r11, r2 + adcs r12, r12, r3 + stm sp!, {r10, r11, r12} + ldm sp, {r10, r11, r12} + adcs r10, r10, r4 + adcs r11, r11, r5 + adcs r12, r12, #0 + stm sp!, {r10, r11, r12} + ldm sp, {r10, r11, r12} + adcs r10, r10, #0 + adcs r11, r11, #0 + adcs r12, r12, r9 + stm sp!, {r10, r11, r12} + sub sp, sp, #48 + sub r0, r0, #16 + # Load bits 252-376 + add sp, sp, #28 + ldm sp, {r1, r2, r3, r4, r5} + lsl r5, r5, #4 + orr r5, r5, r4, lsr #28 + lsl r4, r4, #4 + orr r4, r4, r3, lsr #28 + lsl r3, r3, #4 + orr r3, r3, r2, lsr #28 + lsl r2, r2, #4 + orr r2, r2, r1, lsr #28 + bfc r5, #29, #3 + sub sp, sp, #28 + # Sub product of top 8 words and order + # * -5cf5d3ed +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + mov r1, #0x2c + lsl r1, r1, #8 + add r1, r1, #0x13 +#else + mov r1, #0x2c13 +#endif + movt r1, #0xa30a mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1] - str r11, [r1, #4] -#else - strd r10, r11, [r1] -#endif - # Add -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2, #8] - ldr r5, [r2, #12] -#else - ldrd r4, r5, [r2, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #8] - ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif - adds r12, r12, #-1 - adcs r8, r4, r6 + ldm sp, {r6, r7, r8, r9} + umlal r6, lr, r2, r1 + umaal r7, lr, r3, r1 + umaal r8, lr, r4, r1 + umaal r9, lr, 
r5, r1 + stm sp, {r6, r7, r8, r9} + add sp, sp, #4 + # * -5812631b +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + mov r1, #0x9c + lsl r1, r1, #8 + add r1, r1, #0xe5 +#else + mov r1, #0x9ce5 +#endif + movt r1, #0xa7ed + mov r10, #0 + ldm sp, {r6, r7, r8, r9} + umlal r6, r10, r2, r1 + umaal r7, r10, r3, r1 + umaal r8, r10, r4, r1 + umaal r9, r10, r5, r1 + stm sp, {r6, r7, r8, r9} + add sp, sp, #4 + # * -a2f79cd7 +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + mov r1, #0x63 + lsl r1, r1, #8 + add r1, r1, #0x29 +#else + mov r1, #0x6329 +#endif + movt r1, #0x5d08 + mov r11, #0 + ldm sp, {r6, r7, r8, r9} + umlal r6, r11, r2, r1 + umaal r7, r11, r3, r1 + umaal r8, r11, r4, r1 + umaal r9, r11, r5, r1 + stm sp, {r6, r7, r8, r9} + add sp, sp, #4 + # * -14def9df +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + mov r1, #0x6 + lsl r1, r1, #8 + add r1, r1, #0x21 +#else + mov r1, #0x621 +#endif + movt r1, #0xeb21 mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 + ldm sp, {r6, r7, r8, r9} + umlal r6, r12, r2, r1 + umaal r7, r12, r3, r1 + umaal r8, r12, r4, r1 + umaal r9, r12, r5, r1 + stm sp, {r6, r7, r8, r9} + add sp, sp, #4 + # Add overflows at 4 * 32 + ldm sp, {r6, r7, r8, r9} + bfc r9, #28, #4 + adds r6, r6, lr + adcs r7, r7, r10 + adcs r8, r8, r11 + adc r9, r9, r12 + # Subtract top at 4 * 32 + subs r6, r6, r2 + sbcs r7, r7, r3 + sbcs r8, r8, r4 + sbcs r9, r9, r5 + sbc r1, r1, r1 + sub sp, sp, #16 + ldm sp, {r2, r3, r4, r5} #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #8] - str r9, [r0, #12] + mov r10, #0xd3 + lsl r10, r10, #8 + add r10, r10, #0xed #else - strd r8, r9, [r0, #8] + mov r10, #0xd3ed #endif - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 + movt r10, #0x5cf5 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #8] - str r11, [r1, #12] + mov r11, #0x63 + lsl r11, r11, #8 + add r11, r11, #0x1a #else - strd r10, r11, [r1, #8] + mov r11, #0x631a #endif - # Add + movt r11, #0x5812 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2, #16] - ldr r5, [r2, #20] + mov r12, #0x9c + lsl r12, r12, #8 + add r12, r12, #0xd6 #else - ldrd r4, r5, [r2, #16] + mov r12, #0x9cd6 #endif + movt r12, #0xa2f7 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #16] - ldr r7, [r0, #20] + mov lr, #0xf9 + lsl lr, lr, #8 + add lr, lr, #0xde #else - ldrd r6, r7, [r0, #16] + mov lr, #0xf9de #endif - adds r12, r12, #-1 - adcs r8, r4, r6 + movt lr, #0x14de + and r10, r10, r1 + and r11, r11, r1 + and r12, r12, r1 + and lr, lr, r1 + adds r2, r2, r10 + adcs r3, r3, r11 + adcs r4, r4, r12 + adcs r5, r5, lr + adcs r6, r6, #0 + adcs r7, r7, #0 + and r1, r1, #0x10000000 + adcs r8, r8, #0 + adc r9, r9, r1 + bfc r9, #28, #4 + # Store result + stm r0, {r2, r3, r4, r5, r6, r7, r8, r9} + add sp, sp, #52 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size sc_reduce,.-sc_reduce + .text + .align 4 + .globl sc_muladd + .type sc_muladd, %function +sc_muladd: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #0x50 + add lr, sp, #0x44 + stm lr, {r0, r1, r3} + mov lr, r2 + ldm r1, {r0, r1, r2, r3} + ldm lr!, {r4, r5, r6} + umull r10, r11, r0, r4 + umull r12, r7, r1, r4 + umaal r11, r12, r0, r5 + umull r8, r9, r2, r4 + umaal r12, r8, r1, r5 + umaal r12, r7, r0, r6 + umaal r8, r9, r3, r4 + stm sp, {r10, r11, r12} + umaal r7, r8, r2, r5 + ldm lr!, {r4} + umull r10, r11, r1, r6 + umaal r8, r9, r2, r6 + umaal r7, r10, r0, r4 + umaal 
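
sc_reduce above, and the sc_muladd routine that follows it, work modulo the Ed25519 group order n = 2^252 + 0x14def9dea2f79cd65812631a5cf5d3ed. The immediates assembled with the mov/movt pairs (0xa30a2c13, 0xa7ed9ce5, 0x5d086329, 0xeb210621) are the 32-bit words of 2^128 - 0x14def9dea2f79cd65812631a5cf5d3ed, i.e. the negated low half of n: since 2^252 = -0x14def9dea2f79cd65812631a5cf5d3ed (mod n), a wide value hi*2^252 + lo reduces to lo - hi*0x14def9dea2f79cd65812631a5cf5d3ed, so the whole reduction is a few multiply-accumulate passes plus the conditional additions/subtractions of (shifted) copies of n seen above, rather than a division. For reference, the order in the 32-bit little-endian layout these routines use (illustrative constant, not a name from this patch):

static const uint32_t ed25519_order[8] = {
    0x5cf5d3ed, 0x5812631a, 0xa2f79cd6, 0x14def9de,
    0x00000000, 0x00000000, 0x00000000, 0x10000000
};

sc_muladd computes (a*b + c) mod n: the long run of umull/umaal instructions forms the 512-bit schoolbook product and adds c, and its tail is the same reduction as sc_reduce.
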
r8, r11, r3, r5 + str r7, [sp, #12] + umaal r8, r10, r1, r4 + umaal r9, r11, r3, r6 + umaal r9, r10, r2, r4 + umaal r10, r11, r3, r4 + ldm lr, {r4, r5, r6, r7} mov r12, #0 - adcs r9, r5, r7 - adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #16] - str r9, [r0, #20] -#else - strd r8, r9, [r0, #16] -#endif - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - mov lr, #0 - sbcs r11, r5, r7 - adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #16] - str r11, [r1, #20] + umlal r8, r12, r0, r4 + umaal r9, r12, r1, r4 + umaal r10, r12, r2, r4 + umaal r11, r12, r3, r4 + mov r4, #0 + umlal r9, r4, r0, r5 + umaal r10, r4, r1, r5 + umaal r11, r4, r2, r5 + umaal r12, r4, r3, r5 + mov r5, #0 + umlal r10, r5, r0, r6 + umaal r11, r5, r1, r6 + umaal r12, r5, r2, r6 + umaal r4, r5, r3, r6 + mov r6, #0 + umlal r11, r6, r0, r7 + ldr r0, [sp, #72] + umaal r12, r6, r1, r7 + add r0, r0, #16 + umaal r4, r6, r2, r7 + sub lr, lr, #16 + umaal r5, r6, r3, r7 + ldm r0, {r0, r1, r2, r3} + str r6, [sp, #64] + ldm lr!, {r6} + mov r7, #0 + umlal r8, r7, r0, r6 + umaal r9, r7, r1, r6 + str r8, [sp, #16] + umaal r10, r7, r2, r6 + umaal r11, r7, r3, r6 + ldm lr!, {r6} + mov r8, #0 + umlal r9, r8, r0, r6 + umaal r10, r8, r1, r6 + str r9, [sp, #20] + umaal r11, r8, r2, r6 + umaal r12, r8, r3, r6 + ldm lr!, {r6} + mov r9, #0 + umlal r10, r9, r0, r6 + umaal r11, r9, r1, r6 + str r10, [sp, #24] + umaal r12, r9, r2, r6 + umaal r4, r9, r3, r6 + ldm lr!, {r6} + mov r10, #0 + umlal r11, r10, r0, r6 + umaal r12, r10, r1, r6 + str r11, [sp, #28] + umaal r4, r10, r2, r6 + umaal r5, r10, r3, r6 + ldm lr!, {r11} + umaal r12, r7, r0, r11 + umaal r4, r7, r1, r11 + ldr r6, [sp, #64] + umaal r5, r7, r2, r11 + umaal r6, r7, r3, r11 + ldm lr!, {r11} + umaal r4, r8, r0, r11 + umaal r5, r8, r1, r11 + umaal r6, r8, r2, r11 + umaal r7, r8, r3, r11 + ldm lr, {r11, lr} + umaal r5, r9, r0, r11 + umaal r6, r10, r0, lr + umaal r6, r9, r1, r11 + umaal r7, r10, r1, lr + umaal r7, r9, r2, r11 + umaal r8, r10, r2, lr + umaal r8, r9, r3, r11 + umaal r9, r10, r3, lr + mov r3, r12 + add lr, sp, #32 + stm lr, {r3, r4, r5, r6, r7, r8, r9, r10} + ldr r0, [sp, #68] + # Add c to a * b + ldr lr, [sp, #76] + ldm sp!, {r2, r3, r4, r5, r6, r7, r8, r9} + ldm lr!, {r1, r10, r11, r12} + adds r2, r2, r1 + adcs r3, r3, r10 + adcs r4, r4, r11 + adcs r5, r5, r12 + ldm lr!, {r1, r10, r11, r12} + adcs r6, r6, r1 + adcs r7, r7, r10 + adcs r8, r8, r11 + adcs r9, r9, r12 + mov r1, r9 + stm r0, {r2, r3, r4, r5, r6, r7, r8, r9} + ldm sp, {r2, r3, r4, r5, r6, r7, r8, r9} + adcs r2, r2, #0 + adcs r3, r3, #0 + adcs r4, r4, #0 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adc r9, r9, #0 + sub sp, sp, #32 + # Get 252..503 and 504..507 + lsr lr, r9, #24 + bfc r9, #24, #8 + lsl r9, r9, #4 + orr r9, r9, r8, lsr #28 + lsl r8, r8, #4 + orr r8, r8, r7, lsr #28 + lsl r7, r7, #4 + orr r7, r7, r6, lsr #28 + lsl r6, r6, #4 + orr r6, r6, r5, lsr #28 + lsl r5, r5, #4 + orr r5, r5, r4, lsr #28 + lsl r4, r4, #4 + orr r4, r4, r3, lsr #28 + lsl r3, r3, #4 + orr r3, r3, r2, lsr #28 + lsl r2, r2, #4 + orr r2, r2, r1, lsr #28 + # Add order times bits 504..507 +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + mov r10, #0x2c + lsl r10, r10, #8 + add r10, r10, #0x13 +#else + mov r10, #0x2c13 +#endif + movt r10, #0xa30a +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + mov r11, #0x9c + lsl r11, r11, #8 + add r11, r11, #0xe5 #else - strd r10, r11, [r1, #16] + mov r11, 
#0x9ce5 #endif - # Add + movt r11, #0xa7ed + mov r1, #0 + umlal r2, r1, r10, lr + umaal r3, r1, r11, lr #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r2, #24] - ldr r5, [r2, #28] + mov r10, #0x63 + lsl r10, r10, #8 + add r10, r10, #0x29 #else - ldrd r4, r5, [r2, #24] + mov r10, #0x6329 #endif + movt r10, #0x5d08 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r6, [r0, #24] - ldr r7, [r0, #28] + mov r11, #0x6 + lsl r11, r11, #8 + add r11, r11, #0x21 #else - ldrd r6, r7, [r0, #24] + mov r11, #0x621 #endif - adds r12, r12, #-1 - adcs r8, r4, r6 - adc r9, r5, r7 - # Sub - adds lr, lr, #-1 - sbcs r10, r4, r6 - sbc r11, r5, r7 - mov r12, #-19 - asr r3, r9, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Sub modulus (if overflow) + movt r11, #0xeb21 + umaal r4, r1, r10, lr + umaal r5, r1, r11, lr + adds r6, r6, r1 + adcs r7, r7, #0 + adcs r8, r8, #0 + adc r9, r9, #0 + subs r6, r6, lr + sbcs r7, r7, #0 + sbcs r8, r8, #0 + sbc r9, r9, #0 + # Sub product of top 8 words and order #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0] - ldr r5, [r0, #4] + mov r1, #0x2c + lsl r1, r1, #8 + add r1, r1, #0x13 #else - ldrd r4, r5, [r0] + mov r1, #0x2c13 #endif - subs r4, r4, r12 - sbcs r5, r5, r3 + movt r1, #0xa30a + mov lr, #0 + ldm r0!, {r10, r11, r12} + umlal r10, lr, r2, r1 + umaal r11, lr, r3, r1 + umaal r12, lr, r4, r1 + stm sp!, {r10, r11, r12} + ldm r0!, {r10, r11, r12} + umaal r10, lr, r5, r1 + umaal r11, lr, r6, r1 + umaal r12, lr, r7, r1 + stm sp!, {r10, r11, r12} + ldm r0!, {r10, r11} + umaal r10, lr, r8, r1 + bfc r11, #28, #4 + umaal r11, lr, r9, r1 + stm sp!, {r10, r11, lr} + sub r0, r0, #16 + sub sp, sp, #32 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0] - str r5, [r0, #4] + mov r1, #0x9c + lsl r1, r1, #8 + add r1, r1, #0xe5 #else - strd r4, r5, [r0] + mov r1, #0x9ce5 #endif + movt r1, #0xa7ed + mov lr, #0 + ldm sp, {r10, r11, r12} + umlal r10, lr, r2, r1 + umaal r11, lr, r3, r1 + umaal r12, lr, r4, r1 + stm sp!, {r10, r11, r12} + ldm sp, {r10, r11, r12} + umaal r10, lr, r5, r1 + umaal r11, lr, r6, r1 + umaal r12, lr, r7, r1 + stm sp!, {r10, r11, r12} + ldm sp, {r10, r11} + umaal r10, lr, r8, r1 + umaal r11, lr, r9, r1 + stm sp!, {r10, r11, lr} + sub sp, sp, #32 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #8] - ldr r5, [r0, #12] + mov r1, #0x63 + lsl r1, r1, #8 + add r1, r1, #0x29 #else - ldrd r4, r5, [r0, #8] + mov r1, #0x6329 #endif - sbcs r4, r4, r3 - sbcs r5, r5, r3 + movt r1, #0x5d08 + mov lr, #0 + ldm sp, {r10, r11, r12} + umlal r10, lr, r2, r1 + umaal r11, lr, r3, r1 + umaal r12, lr, r4, r1 + stm sp!, {r10, r11, r12} + ldm sp, {r10, r11, r12} + umaal r10, lr, r5, r1 + umaal r11, lr, r6, r1 + umaal r12, lr, r7, r1 + stm sp!, {r10, r11, r12} + ldm sp, {r10, r11} + umaal r10, lr, r8, r1 + umaal r11, lr, r9, r1 + stm sp!, {r10, r11, lr} + sub sp, sp, #32 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #8] - str r5, [r0, #12] + mov r1, #0x6 + lsl r1, r1, #8 + add r1, r1, #0x21 #else - strd r4, r5, [r0, #8] + mov r1, #0x621 #endif + movt r1, #0xeb21 + mov lr, #0 + ldm sp, {r10, r11, r12} + umlal r10, lr, r2, r1 + umaal r11, lr, r3, r1 + umaal r12, lr, r4, r1 + stm sp!, {r10, r11, r12} + ldm sp, {r10, r11, r12} + umaal r10, lr, r5, r1 + umaal r11, lr, r6, r1 + umaal r12, lr, r7, r1 + stm sp!, {r10, r11, r12} + ldm sp, {r10, r11} + umaal r10, lr, r8, r1 + umaal r11, lr, r9, r1 + stm sp!, {r10, r11, lr} + sub 
sp, sp, #32 + # Subtract at 4 * 32 + ldm sp, {r10, r11, r12} + subs r10, r10, r2 + sbcs r11, r11, r3 + sbcs r12, r12, r4 + stm sp!, {r10, r11, r12} + ldm sp, {r10, r11, r12} + sbcs r10, r10, r5 + sbcs r11, r11, r6 + sbcs r12, r12, r7 + stm sp!, {r10, r11, r12} + ldm sp, {r10, r11} + sbcs r10, r10, r8 + sbc r11, r11, r9 + stm sp!, {r10, r11} + sub sp, sp, #36 + asr lr, r11, #25 + # Conditionally subtract order starting at bit 125 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r0, #16] - ldr r5, [r0, #20] + mov r1, #0xa00000 + lsl r1, r1, #8 + add r1, r1, #0x0 #else - ldrd r4, r5, [r0, #16] + mov r1, #0xa0000000 #endif - sbcs r4, r4, r3 - sbcs r5, r5, r3 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r0, #16] - str r5, [r0, #20] + mov r2, #0xba + lsl r2, r2, #8 + add r2, r2, #0x7d #else - strd r4, r5, [r0, #16] + mov r2, #0xba7d #endif - sbcs r8, r8, r3 - sbc r9, r9, lr + movt r2, #0x4b9e #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r8, [r0, #24] - str r9, [r0, #28] + mov r3, #0x4c + lsl r3, r3, #8 + add r3, r3, #0x63 #else - strd r8, r9, [r0, #24] + mov r3, #0x4c63 #endif - mov r12, #-19 - asr r3, r11, #31 - # Mask the modulus - and r12, r3, r12 - and lr, r3, #0x7fffffff - # Add modulus (if underflow) + movt r3, #0xcb02 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1] - ldr r5, [r1, #4] + mov r4, #0xf3 + lsl r4, r4, #8 + add r4, r4, #0x9a #else - ldrd r4, r5, [r1] + mov r4, #0xf39a #endif - adds r4, r4, r12 - adcs r5, r5, r3 + movt r4, #0xd45e #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1] - str r5, [r1, #4] + mov r5, #0xdf + lsl r5, r5, #8 + add r5, r5, #0x3b #else - strd r4, r5, [r1] + mov r5, #0xdf3b #endif + movt r5, #0x29b #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #8] - ldr r5, [r1, #12] + mov r9, #0x20000 + lsl r9, r9, #8 + add r9, r9, #0x0 #else - ldrd r4, r5, [r1, #8] + mov r9, #0x2000000 #endif - adcs r4, r4, r3 - adcs r5, r5, r3 + and r1, r1, lr + and r2, r2, lr + and r3, r3, lr + and r4, r4, lr + and r5, r5, lr + and r9, r9, lr + ldm sp, {r10, r11, r12} + adds r10, r10, r1 + adcs r11, r11, r2 + adcs r12, r12, r3 + stm sp!, {r10, r11, r12} + ldm sp, {r10, r11, r12} + adcs r10, r10, r4 + adcs r11, r11, r5 + adcs r12, r12, #0 + stm sp!, {r10, r11, r12} + ldm sp, {r10, r11, r12} + adcs r10, r10, #0 + adcs r11, r11, #0 + adcs r12, r12, r9 + stm sp!, {r10, r11, r12} + sub sp, sp, #48 + sub r0, r0, #16 + # Load bits 252-376 + add sp, sp, #28 + ldm sp, {r1, r2, r3, r4, r5} + lsl r5, r5, #4 + orr r5, r5, r4, lsr #28 + lsl r4, r4, #4 + orr r4, r4, r3, lsr #28 + lsl r3, r3, #4 + orr r3, r3, r2, lsr #28 + lsl r2, r2, #4 + orr r2, r2, r1, lsr #28 + bfc r5, #29, #3 + sub sp, sp, #28 + # Sub product of top 8 words and order + # * -5cf5d3ed +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + mov r1, #0x2c + lsl r1, r1, #8 + add r1, r1, #0x13 +#else + mov r1, #0x2c13 +#endif + movt r1, #0xa30a + mov lr, #0 + ldm sp, {r6, r7, r8, r9} + umlal r6, lr, r2, r1 + umaal r7, lr, r3, r1 + umaal r8, lr, r4, r1 + umaal r9, lr, r5, r1 + stm sp, {r6, r7, r8, r9} + add sp, sp, #4 + # * -5812631b +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + mov r1, #0x9c + lsl r1, r1, #8 + add r1, r1, #0xe5 +#else + mov r1, #0x9ce5 +#endif + movt r1, #0xa7ed + mov r10, #0 + ldm sp, {r6, r7, r8, r9} + umlal r6, r10, r2, r1 + umaal r7, r10, r3, r1 + umaal r8, r10, r4, r1 + umaal r9, r10, r5, r1 + stm sp, {r6, r7, r8, r9} + add sp, 
sp, #4 + # * -a2f79cd7 +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + mov r1, #0x63 + lsl r1, r1, #8 + add r1, r1, #0x29 +#else + mov r1, #0x6329 +#endif + movt r1, #0x5d08 + mov r11, #0 + ldm sp, {r6, r7, r8, r9} + umlal r6, r11, r2, r1 + umaal r7, r11, r3, r1 + umaal r8, r11, r4, r1 + umaal r9, r11, r5, r1 + stm sp, {r6, r7, r8, r9} + add sp, sp, #4 + # * -14def9df +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + mov r1, #0x6 + lsl r1, r1, #8 + add r1, r1, #0x21 +#else + mov r1, #0x621 +#endif + movt r1, #0xeb21 + mov r12, #0 + ldm sp, {r6, r7, r8, r9} + umlal r6, r12, r2, r1 + umaal r7, r12, r3, r1 + umaal r8, r12, r4, r1 + umaal r9, r12, r5, r1 + stm sp, {r6, r7, r8, r9} + add sp, sp, #4 + # Add overflows at 4 * 32 + ldm sp, {r6, r7, r8, r9} + bfc r9, #28, #4 + adds r6, r6, lr + adcs r7, r7, r10 + adcs r8, r8, r11 + adc r9, r9, r12 + # Subtract top at 4 * 32 + subs r6, r6, r2 + sbcs r7, r7, r3 + sbcs r8, r8, r4 + sbcs r9, r9, r5 + sbc r1, r1, r1 + sub sp, sp, #16 + ldm sp, {r2, r3, r4, r5} #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1, #8] - str r5, [r1, #12] + mov r10, #0xd3 + lsl r10, r10, #8 + add r10, r10, #0xed #else - strd r4, r5, [r1, #8] + mov r10, #0xd3ed #endif + movt r10, #0x5cf5 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - ldr r4, [r1, #16] - ldr r5, [r1, #20] + mov r11, #0x63 + lsl r11, r11, #8 + add r11, r11, #0x1a #else - ldrd r4, r5, [r1, #16] + mov r11, #0x631a #endif - adcs r4, r4, r3 - adcs r5, r5, r3 + movt r11, #0x5812 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r4, [r1, #16] - str r5, [r1, #20] + mov r12, #0x9c + lsl r12, r12, #8 + add r12, r12, #0xd6 #else - strd r4, r5, [r1, #16] + mov r12, #0x9cd6 #endif - adcs r10, r10, r3 - adc r11, r11, lr + movt r12, #0xa2f7 #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - str r10, [r1, #24] - str r11, [r1, #28] + mov lr, #0xf9 + lsl lr, lr, #8 + add lr, lr, #0xde #else - strd r10, r11, [r1, #24] + mov lr, #0xf9de #endif - add sp, sp, #0x60 + movt lr, #0x14de + and r10, r10, r1 + and r11, r11, r1 + and r12, r12, r1 + and lr, lr, r1 + adds r2, r2, r10 + adcs r3, r3, r11 + adcs r4, r4, r12 + adcs r5, r5, lr + adcs r6, r6, #0 + adcs r7, r7, #0 + and r1, r1, #0x10000000 + adcs r8, r8, #0 + adc r9, r9, r1 + bfc r9, #28, #4 + # Store result + stm r0, {r2, r3, r4, r5, r6, r7, r8, r9} + add sp, sp, #0x50 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} - .size fe_ge_sub,.-fe_ge_sub + .size sc_muladd,.-sc_muladd +#endif /* HAVE_ED25519 */ -#endif /* HAVE_CURVE25519 */ -#endif /* !__aarch64__ */ +#endif /* !CURVE25519_SMALL || !ED25519_SMALL */ +#endif /* HAVE_CURVE25519 || HAVE_ED25519 */ +#endif /* !__aarch64__ && !__thumb__ */ #endif /* WOLFSSL_ARMASM */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif +#endif /* !WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c b/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c index 4862f759d9..403c8c5ef7 100644 --- a/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c @@ -30,102 +30,372 @@ #include #ifdef WOLFSSL_ARMASM -#ifndef __aarch64__ +#if !defined(__aarch64__) && defined(__arm__) #include #ifdef HAVE_CONFIG_H #include #endif /* HAVE_CONFIG_H */ #include +#ifdef WOLFSSL_ARMASM_INLINE +/* Based on work by: Emil Lenngren + * https://github.com/pornin/X25519-Cortex-M4 + */ + #include +#define CURVED25519_ASM +#include -#ifdef HAVE_CURVE25519 +#if 
defined(HAVE_CURVE25519) || defined(HAVE_ED25519) +#if !defined(CURVE25519_SMALL) || !defined(ED25519_SMALL) void fe_init() { __asm__ __volatile__ ( "\n\t" - : + : : : "memory" ); } -void fe_frombytes(fe out_p, const unsigned char* in_p) +void fe_add_sub_op(void); +void fe_add_sub_op() { - register fe out asm ("r0") = out_p; - register const unsigned char* in asm ("r1") = in_p; __asm__ __volatile__ ( - "ldr r2, [%[in]]\n\t" - "ldr r3, [%[in], #4]\n\t" - "ldr r4, [%[in], #8]\n\t" - "ldr r5, [%[in], #12]\n\t" - "ldr r6, [%[in], #16]\n\t" - "ldr r7, [%[in], #20]\n\t" - "ldr r8, [%[in], #24]\n\t" - "ldr r9, [%[in], #28]\n\t" - "and r9, r9, #0x7fffffff\n\t" + /* Add-Sub */ #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r2, [%[out]]\n\t" - "str r3, [%[out], #4]\n\t" + "ldr r4, [r2]\n\t" + "ldr r5, [r2, #4]\n\t" #else - "strd r2, r3, [%[out]]\n\t" + "ldrd r4, r5, [r2]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [%[out], #8]\n\t" - "str r5, [%[out], #12]\n\t" + "ldr r6, [r3]\n\t" + "ldr r7, [r3, #4]\n\t" #else - "strd r4, r5, [%[out], #8]\n\t" + "ldrd r6, r7, [r3]\n\t" #endif + /* Add */ + "adds r8, r4, r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [%[out], #16]\n\t" - "str r7, [%[out], #20]\n\t" + "str r8, [r0]\n\t" + "str r9, [r0, #4]\n\t" #else - "strd r6, r7, [%[out], #16]\n\t" + "strd r8, r9, [r0]\n\t" #endif + /* Sub */ + "subs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [%[out], #24]\n\t" - "str r9, [%[out], #28]\n\t" + "str r10, [r1]\n\t" + "str r11, [r1, #4]\n\t" #else - "strd r8, r9, [%[out], #24]\n\t" + "strd r10, r11, [r1]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r4, [r2, #8]\n\t" + "ldr r5, [r2, #12]\n\t" +#else + "ldrd r4, r5, [r2, #8]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r6, [r3, #8]\n\t" + "ldr r7, [r3, #12]\n\t" +#else + "ldrd r6, r7, [r3, #8]\n\t" +#endif + /* Sub */ + "sbcs r10, r4, r6\n\t" + "mov lr, #0\n\t" + "sbcs r11, r5, r7\n\t" + "adc lr, lr, #0\n\t" +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "str r10, [r1, #8]\n\t" + "str r11, [r1, #12]\n\t" +#else + "strd r10, r11, [r1, #8]\n\t" +#endif + /* Add */ + "subs r12, r12, #1\n\t" + "adcs r8, r4, r6\n\t" + "adcs r9, r5, r7\n\t" +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "str r8, [r0, #8]\n\t" + "str r9, [r0, #12]\n\t" +#else + "strd r8, r9, [r0, #8]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r4, [r2, #16]\n\t" + "ldr r5, [r2, #20]\n\t" +#else + "ldrd r4, r5, [r2, #16]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r6, [r3, #16]\n\t" + "ldr r7, [r3, #20]\n\t" +#else + "ldrd r6, r7, [r3, #16]\n\t" +#endif + /* Add */ + "adcs r8, r4, r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "str r8, [r0, #16]\n\t" + "str r9, [r0, #20]\n\t" +#else + "strd r8, r9, [r0, #16]\n\t" +#endif + /* Sub */ + "subs lr, lr, #1\n\t" + "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "str r10, [r1, #16]\n\t" + "str r11, [r1, #20]\n\t" +#else + "strd r10, r11, [r1, #16]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r4, [r2, 
#24]\n\t" + "ldr r5, [r2, #28]\n\t" +#else + "ldrd r4, r5, [r2, #24]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r6, [r3, #24]\n\t" + "ldr r7, [r3, #28]\n\t" +#else + "ldrd r6, r7, [r3, #24]\n\t" +#endif + /* Sub */ + "sbcs r10, r4, r6\n\t" + "sbcs r11, r5, r7\n\t" + "sbc lr, lr, lr\n\t" + /* Add */ + "subs r12, r12, #1\n\t" + "adcs r8, r4, r6\n\t" + "mov r12, #0\n\t" + "adcs r9, r5, r7\n\t" + "adc r12, r12, #0\n\t" + /* Multiply -modulus by overflow */ + "lsl r3, r12, #1\n\t" + "mov r12, #19\n\t" + "orr r3, r3, r9, lsr #31\n\t" + "mul r12, r3, r12\n\t" + /* Add -x*modulus (if overflow) */ +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r4, [r0]\n\t" + "ldr r5, [r0, #4]\n\t" +#else + "ldrd r4, r5, [r0]\n\t" +#endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r6, [r0, #8]\n\t" + "ldr r7, [r0, #12]\n\t" +#else + "ldrd r6, r7, [r0, #8]\n\t" +#endif + "adds r4, r4, r12\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "str r4, [r0]\n\t" + "str r5, [r0, #4]\n\t" +#else + "strd r4, r5, [r0]\n\t" #endif - : [out] "+r" (out), [in] "+r" (in) - : - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" - ); -} - -void fe_tobytes(unsigned char* out_p, const fe n_p) -{ - register unsigned char* out asm ("r0") = out_p; - register const fe n asm ("r1") = n_p; - - __asm__ __volatile__ ( #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r2, [%[n]]\n\t" - "ldr r3, [%[n], #4]\n\t" + "str r6, [r0, #8]\n\t" + "str r7, [r0, #12]\n\t" #else - "ldrd r2, r3, [%[n]]\n\t" + "strd r6, r7, [r0, #8]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[n], #8]\n\t" - "ldr r5, [%[n], #12]\n\t" + "ldr r4, [r0, #16]\n\t" + "ldr r5, [r0, #20]\n\t" #else - "ldrd r4, r5, [%[n], #8]\n\t" + "ldrd r4, r5, [r0, #16]\n\t" #endif + "adcs r4, r4, #0\n\t" + "adcs r5, r5, #0\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [%[n], #16]\n\t" - "ldr r7, [%[n], #20]\n\t" + "str r4, [r0, #16]\n\t" + "str r5, [r0, #20]\n\t" #else - "ldrd r6, r7, [%[n], #16]\n\t" + "strd r4, r5, [r0, #16]\n\t" #endif + "bfc r9, #31, #1\n\t" + "adcs r8, r8, #0\n\t" + "adc r9, r9, #0\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [%[n], #24]\n\t" - "ldr r9, [%[n], #28]\n\t" + "str r8, [r0, #24]\n\t" + "str r9, [r0, #28]\n\t" #else - "ldrd r8, r9, [%[n], #24]\n\t" + "strd r8, r9, [r0, #24]\n\t" #endif + /* Multiply -modulus by underflow */ + "lsl r3, lr, #1\n\t" + "mvn lr, #18\n\t" + "orr r3, r3, r11, lsr #31\n\t" + "mul lr, r3, lr\n\t" + /* Sub -x*modulus (if overflow) */ + "ldm r1, {r4, r5, r6, r7, r8, r9}\n\t" + "subs r4, r4, lr\n\t" + "sbcs r5, r5, #0\n\t" + "sbcs r6, r6, #0\n\t" + "sbcs r7, r7, #0\n\t" + "sbcs r8, r8, #0\n\t" + "sbcs r9, r9, #0\n\t" + "bfc r11, #31, #1\n\t" + "sbcs r10, r10, #0\n\t" + "sbc r11, r11, #0\n\t" + "stm r1, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + /* Done Add-Sub */ + : + : + : "memory", "lr" + ); +} + +void fe_sub_op(void); +void fe_sub_op() +{ + + __asm__ __volatile__ ( + /* Sub */ + "ldm r2!, {r6, r7, r8, r9, r10, r11, r12, lr}\n\t" + "ldm r1!, {r2, r3, r4, r5}\n\t" + "subs r6, r2, r6\n\t" + "sbcs r7, r3, r7\n\t" + "sbcs r8, r4, r8\n\t" + "sbcs r9, r5, r9\n\t" + "ldm r1!, {r2, r3, r4, r5}\n\t" + "sbcs r10, r2, r10\n\t" + "sbcs r11, r3, r11\n\t" + "sbcs r12, r4, r12\n\t" + "sbcs lr, r5, lr\n\t" + "sbc r3, r3, r3\n\t" + "mvn r2, 
#18\n\t" + "lsl r3, r3, #1\n\t" + "orr r3, r3, lr, lsr #31\n\t" + "mul r2, r3, r2\n\t" + "subs r6, r6, r2\n\t" + "sbcs r7, r7, #0\n\t" + "sbcs r8, r8, #0\n\t" + "sbcs r9, r9, #0\n\t" + "sbcs r10, r10, #0\n\t" + "sbcs r11, r11, #0\n\t" + "bfc lr, #31, #1\n\t" + "sbcs r12, r12, #0\n\t" + "sbc lr, lr, #0\n\t" + "stm r0, {r6, r7, r8, r9, r10, r11, r12, lr}\n\t" + /* Done Sub */ + : + : + : "memory", "lr" + ); +} + +void fe_sub(fe r_p, const fe a_p, const fe b_p) +{ + register fe r asm ("r0") = r_p; + register const fe a asm ("r1") = a_p; + register const fe b asm ("r2") = b_p; + + __asm__ __volatile__ ( + "bl fe_sub_op\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +void fe_add_op(void); +void fe_add_op() +{ + + __asm__ __volatile__ ( + /* Add */ + "ldm r2!, {r6, r7, r8, r9, r10, r11, r12, lr}\n\t" + "ldm r1!, {r2, r3, r4, r5}\n\t" + "adds r6, r2, r6\n\t" + "adcs r7, r3, r7\n\t" + "adcs r8, r4, r8\n\t" + "adcs r9, r5, r9\n\t" + "ldm r1!, {r2, r3, r4, r5}\n\t" + "adcs r10, r2, r10\n\t" + "adcs r11, r3, r11\n\t" + "adcs r12, r4, r12\n\t" + "mov r3, #0\n\t" + "adcs lr, r5, lr\n\t" + "adc r3, r3, #0\n\t" + "mov r2, #19\n\t" + "lsl r3, r3, #1\n\t" + "orr r3, r3, lr, lsr #31\n\t" + "mul r2, r3, r2\n\t" + "adds r6, r6, r2\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adcs r11, r11, #0\n\t" + "bfc lr, #31, #1\n\t" + "adcs r12, r12, #0\n\t" + "adc lr, lr, #0\n\t" + "stm r0, {r6, r7, r8, r9, r10, r11, r12, lr}\n\t" + /* Done Add */ + : + : + : "memory", "lr" + ); +} + +void fe_add(fe r_p, const fe a_p, const fe b_p) +{ + register fe r asm ("r0") = r_p; + register const fe a asm ("r1") = a_p; + register const fe b asm ("r2") = b_p; + + __asm__ __volatile__ ( + "bl fe_add_op\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +#ifdef HAVE_ED25519 +void fe_frombytes(fe out_p, const unsigned char* in_p) +{ + register fe out asm ("r0") = out_p; + register const unsigned char* in asm ("r1") = in_p; + + __asm__ __volatile__ ( + "ldm %[in], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "bfc r9, #31, #1\n\t" + "stm %[out], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + : [out] "+r" (out), [in] "+r" (in) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" + ); +} + +void fe_tobytes(unsigned char* out_p, const fe n_p) +{ + register unsigned char* out asm ("r0") = out_p; + register const fe n asm ("r1") = n_p; + + __asm__ __volatile__ ( + "ldm %[n], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" "adds r12, r2, #19\n\t" "adcs r12, r3, #0\n\t" "adcs r12, r4, #0\n\t" @@ -144,31 +414,8 @@ void fe_tobytes(unsigned char* out_p, const fe n_p) "adcs r7, r7, #0\n\t" "adcs r8, r8, #0\n\t" "adc r9, r9, #0\n\t" - "and r9, r9, #0x7fffffff\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r2, [%[out]]\n\t" - "str r3, [%[out], #4]\n\t" -#else - "strd r2, r3, [%[out]]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [%[out], #8]\n\t" - "str r5, [%[out], #12]\n\t" -#else - "strd r4, r5, [%[out], #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [%[out], #16]\n\t" - "str r7, [%[out], #20]\n\t" -#else - "strd r6, r7, [%[out], #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [%[out], #24]\n\t" - "str r9, [%[out], #28]\n\t" -#else - "strd r8, r9, 
[%[out], #24]\n\t" -#endif + "bfc r9, #31, #1\n\t" + "stm %[out], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" : [out] "+r" (out), [n] "+r" (n) : : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12" @@ -313,341 +560,27 @@ void fe_copy(fe r_p, const fe a_p) ); } -void fe_sub(fe r_p, const fe a_p, const fe b_p) +void fe_neg(fe r_p, const fe a_p) { register fe r asm ("r0") = r_p; register const fe a asm ("r1") = a_p; - register const fe b asm ("r2") = b_p; __asm__ __volatile__ ( - /* Sub */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[a]]\n\t" - "ldr r5, [%[a], #4]\n\t" -#else - "ldrd r4, r5, [%[a]]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[a], #12]\n\t" -#else - "ldrd r6, r7, [%[a], #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [%[b]]\n\t" - "ldr r9, [%[b], #4]\n\t" -#else - "ldrd r8, r9, [%[b]]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [%[b], #8]\n\t" - "ldr r11, [%[b], #12]\n\t" -#else - "ldrd r10, r11, [%[b], #8]\n\t" -#endif - "subs r8, r4, r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbcs r11, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [%[r]]\n\t" - "str r9, [%[r], #4]\n\t" -#else - "strd r8, r9, [%[r]]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [%[r], #8]\n\t" - "str r11, [%[r], #12]\n\t" -#else - "strd r10, r11, [%[r], #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[a], #16]\n\t" - "ldr r5, [%[a], #20]\n\t" -#else - "ldrd r4, r5, [%[a], #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [%[a], #24]\n\t" - "ldr r7, [%[a], #28]\n\t" -#else - "ldrd r6, r7, [%[a], #24]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [%[b], #16]\n\t" - "ldr r9, [%[b], #20]\n\t" -#else - "ldrd r8, r9, [%[b], #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [%[b], #24]\n\t" - "ldr r11, [%[b], #28]\n\t" -#else - "ldrd r10, r11, [%[b], #24]\n\t" -#endif - "sbcs r8, r4, r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr r3, r11, #31\n\t" - /* Mask the modulus */ - "and r12, r3, r12\n\t" - "and lr, r3, #0x7fffffff\n\t" - /* Add modulus (if underflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[r]]\n\t" - "ldr r5, [%[r], #4]\n\t" -#else - "ldrd r4, r5, [%[r]]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [%[r], #8]\n\t" - "ldr r7, [%[r], #12]\n\t" -#else - "ldrd r6, r7, [%[r], #8]\n\t" -#endif - "adds r4, r4, r12\n\t" - "adcs r5, r5, r3\n\t" - "adcs r6, r6, r3\n\t" - "adcs r7, r7, r3\n\t" - "adcs r8, r8, r3\n\t" - "adcs r9, r9, r3\n\t" - "adcs r10, r10, r3\n\t" - "adc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [%[r]]\n\t" - "str r5, [%[r], #4]\n\t" -#else - "strd r4, r5, [%[r]]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [%[r], #8]\n\t" - "str r7, [%[r], #12]\n\t" -#else - "strd r6, r7, [%[r], #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [%[r], #16]\n\t" - "str r9, [%[r], #20]\n\t" -#else - "strd r8, r9, [%[r], #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [%[r], #24]\n\t" - "str r11, [%[r], #28]\n\t" -#else - "strd r10, r11, [%[r], #24]\n\t" -#endif - : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" - ); -} - -void fe_add(fe r_p, const fe a_p, const fe b_p) -{ - register fe r asm ("r0") = r_p; - register const fe a asm ("r1") = a_p; - register const fe b asm ("r2") = b_p; - - __asm__ __volatile__ ( - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[a]]\n\t" - "ldr r5, [%[a], #4]\n\t" -#else - "ldrd r4, r5, [%[a]]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[a], #12]\n\t" -#else - "ldrd r6, r7, [%[a], #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [%[b]]\n\t" - "ldr r9, [%[b], #4]\n\t" -#else - "ldrd r8, r9, [%[b]]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [%[b], #8]\n\t" - "ldr r11, [%[b], #12]\n\t" -#else - "ldrd r10, r11, [%[b], #8]\n\t" -#endif - "adds r8, r4, r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adcs r11, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [%[r]]\n\t" - "str r9, [%[r], #4]\n\t" -#else - "strd r8, r9, [%[r]]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [%[r], #8]\n\t" - "str r11, [%[r], #12]\n\t" -#else - "strd r10, r11, [%[r], #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[a], #16]\n\t" - "ldr r5, [%[a], #20]\n\t" -#else - "ldrd r4, r5, [%[a], #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [%[a], #24]\n\t" - "ldr r7, [%[a], #28]\n\t" -#else - "ldrd r6, r7, [%[a], #24]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [%[b], #16]\n\t" - "ldr r9, [%[b], #20]\n\t" -#else - "ldrd r8, r9, [%[b], #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [%[b], #24]\n\t" - "ldr r11, [%[b], #28]\n\t" -#else - "ldrd r10, r11, [%[b], #24]\n\t" -#endif - "adcs r8, r4, r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr r3, r11, #31\n\t" - /* Mask the modulus */ - "and r12, r3, r12\n\t" - "and lr, r3, #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[r]]\n\t" - "ldr r5, [%[r], #4]\n\t" -#else - "ldrd r4, r5, [%[r]]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [%[r], #8]\n\t" - "ldr r7, [%[r], #12]\n\t" -#else - "ldrd r6, r7, [%[r], #8]\n\t" -#endif - "subs r4, r4, r12\n\t" - "sbcs r5, r5, r3\n\t" - "sbcs r6, r6, r3\n\t" - "sbcs r7, r7, r3\n\t" - "sbcs r8, r8, r3\n\t" - "sbcs r9, r9, r3\n\t" - "sbcs r10, r10, r3\n\t" - "sbc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [%[r]]\n\t" - "str r5, [%[r], #4]\n\t" -#else - "strd r4, r5, [%[r]]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [%[r], #8]\n\t" - "str r7, [%[r], #12]\n\t" -#else - "strd r6, r7, [%[r], #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [%[r], #16]\n\t" - "str r9, [%[r], #20]\n\t" -#else - "strd r8, r9, [%[r], #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - 
"str r10, [%[r], #24]\n\t" - "str r11, [%[r], #28]\n\t" -#else - "strd r10, r11, [%[r], #24]\n\t" -#endif - : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" - ); -} - -void fe_neg(fe r_p, const fe a_p) -{ - register fe r asm ("r0") = r_p; - register const fe a asm ("r1") = a_p; - - __asm__ __volatile__ ( - "mov lr, #-1\n\t" - "mov r12, #-19\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#else - "ldrd r2, r3, [%[a]]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[a], #8]\n\t" - "ldr r5, [%[a], #12]\n\t" -#else - "ldrd r4, r5, [%[a], #8]\n\t" -#endif + "mvn lr, #0\n\t" + "mvn r12, #18\n\t" + "ldm %[a]!, {r2, r3, r4, r5}\n\t" "subs r2, r12, r2\n\t" "sbcs r3, lr, r3\n\t" "sbcs r4, lr, r4\n\t" "sbcs r5, lr, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r2, [%[r]]\n\t" - "str r3, [%[r], #4]\n\t" -#else - "strd r2, r3, [%[r]]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [%[r], #8]\n\t" - "str r5, [%[r], #12]\n\t" -#else - "strd r4, r5, [%[r], #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "mov r12, #0x7fffff\n\t" - "lsl r12, r12, #8\n\t" - "add r12, r12, #0xff\n\t" -#else - "mov r12, #0x7fffffff\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r2, [%[a], #16]\n\t" - "ldr r3, [%[a], #20]\n\t" -#else - "ldrd r2, r3, [%[a], #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[a], #24]\n\t" - "ldr r5, [%[a], #28]\n\t" -#else - "ldrd r4, r5, [%[a], #24]\n\t" -#endif + "stm %[r]!, {r2, r3, r4, r5}\n\t" + "mvn r12, #0x80000000\n\t" + "ldm %[a]!, {r2, r3, r4, r5}\n\t" "sbcs r2, lr, r2\n\t" "sbcs r3, lr, r3\n\t" "sbcs r4, lr, r4\n\t" "sbc r5, r12, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r2, [%[r], #16]\n\t" - "str r3, [%[r], #20]\n\t" -#else - "strd r2, r3, [%[r], #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [%[r], #24]\n\t" - "str r5, [%[r], #28]\n\t" -#else - "strd r4, r5, [%[r], #24]\n\t" -#endif + "stm %[r]!, {r2, r3, r4, r5}\n\t" : [r] "+r" (r), [a] "+r" (a) : : "memory", "r2", "r3", "r4", "r5", "r12", "lr" @@ -659,30 +592,7 @@ int fe_isnonzero(const fe a_p) register const fe a asm ("r0") = a_p; __asm__ __volatile__ ( -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#else - "ldrd r2, r3, [%[a]]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[a], #8]\n\t" - "ldr r5, [%[a], #12]\n\t" -#else - "ldrd r4, r5, [%[a], #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [%[a], #16]\n\t" - "ldr r7, [%[a], #20]\n\t" -#else - "ldrd r6, r7, [%[a], #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [%[a], #24]\n\t" - "ldr r9, [%[a], #28]\n\t" -#else - "ldrd r8, r9, [%[a], #24]\n\t" -#endif + "ldm %[a], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" "adds r1, r2, #19\n\t" "adcs r1, r3, #0\n\t" "adcs r1, r4, #0\n\t" @@ -701,7 +611,7 @@ int fe_isnonzero(const fe a_p) "adcs r7, r7, #0\n\t" "adcs r8, r8, #0\n\t" "adc r9, r9, #0\n\t" - "and r9, r9, #0x7fffffff\n\t" + "bfc r9, #31, #1\n\t" "orr r2, r2, r3\n\t" "orr r4, r4, r5\n\t" "orr r6, r6, r7\n\t" @@ -721,38 +631,16 @@ int 
fe_isnegative(const fe a_p) register const fe a asm ("r0") = a_p; __asm__ __volatile__ ( -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" -#else - "ldrd r2, r3, [%[a]]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[a], #8]\n\t" - "ldr r5, [%[a], #12]\n\t" -#else - "ldrd r4, r5, [%[a], #8]\n\t" -#endif + "ldm %[a]!, {r2, r3, r4, r5}\n\t" "adds r1, r2, #19\n\t" "adcs r1, r3, #0\n\t" "adcs r1, r4, #0\n\t" "adcs r1, r5, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r2, [%[a], #16]\n\t" - "ldr r3, [%[a], #20]\n\t" -#else - "ldrd r2, r3, [%[a], #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[a], #24]\n\t" - "ldr r5, [%[a], #28]\n\t" -#else - "ldrd r4, r5, [%[a], #24]\n\t" -#endif + "ldm %[a], {r2, r3, r4, r5}\n\t" "adcs r1, r2, #0\n\t" "adcs r1, r3, #0\n\t" "adcs r1, r4, #0\n\t" - "ldr r2, [%[a]]\n\t" + "ldr r2, [%[a], #-16]\n\t" "adc r1, r5, #0\n\t" "and %[a], r2, #1\n\t" "lsr r1, r1, #31\n\t" @@ -764,6 +652,7 @@ int fe_isnegative(const fe a_p) return (uint32_t)(size_t)a; } +#ifndef WC_NO_CACHE_RESISTANT void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) { register fe* r asm ("r0") = r_p; @@ -1157,8 +1046,8 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "sub %[base], %[base], #0x2a0\n\t" - "mov r10, #-19\n\t" - "mov r11, #-1\n\t" + "mvn r10, #18\n\t" + "mvn r11, #0\n\t" "subs r10, r10, r8\n\t" "sbcs r11, r11, r9\n\t" "sbc lr, lr, lr\n\t" @@ -1580,8 +1469,8 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "sub %[base], %[base], #0x2a0\n\t" - "mov r10, #-1\n\t" - "mov r11, #-1\n\t" + "mvn r10, #0\n\t" + "mvn r11, #0\n\t" "rsbs lr, lr, #0\n\t" "sbcs r10, r10, r8\n\t" "sbcs r11, r11, r9\n\t" @@ -2004,8 +1893,8 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "sub %[base], %[base], #0x2a0\n\t" - "mov r10, #-1\n\t" - "mov r11, #-1\n\t" + "mvn r10, #0\n\t" + "mvn r11, #0\n\t" "rsbs lr, lr, #0\n\t" "sbcs r10, r10, r8\n\t" "sbcs r11, r11, r9\n\t" @@ -2428,14 +2317,8 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "sub %[base], %[base], #0x2a0\n\t" - "mov r10, #-1\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "mov r11, #0x7fffff\n\t" - "lsl r11, r11, #8\n\t" - "add r11, r11, #0xff\n\t" -#else - "mov r11, #0x7fffffff\n\t" -#endif + "mvn r10, #0\n\t" + "mvn r11, #0x80000000\n\t" "rsbs lr, lr, #0\n\t" "sbcs r10, r10, r8\n\t" "sbc r11, r11, r9\n\t" @@ -2478,566 +2361,485 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) ); } -void fe_mul(fe r_p, const fe a_p, const fe b_p) +#else +void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) { - register fe r asm ("r0") = r_p; - register const fe a asm ("r1") = a_p; - register const fe b asm ("r2") = b_p; - + register fe* r asm ("r0") = r_p; + register fe* base asm ("r1") = base_p; + register signed char b asm ("r2") = b_p; + __asm__ __volatile__ ( - "sub sp, sp, #0x40\n\t" - /* Multiply */ - "ldr r7, [%[a]]\n\t" - "ldr r8, [%[a], #4]\n\t" - "ldr r9, [%[b]]\n\t" - "ldr lr, [%[b], #4]\n\t" - /* A[0] * B[0] = 0 */ - "umull r4, r5, r7, r9\n\t" - "str r4, [sp]\n\t" - /* A[0] * B[1] = 1 */ - "umull r3, r6, r7, lr\n\t" - "adds r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[1] * B[0] = 1 */ - "umull r3, r12, r8, r9\n\t" - "adds 
r5, r5, r3\n\t" - "mov r4, #0\n\t" - "adcs r6, r6, r12\n\t" - "adc r4, r4, #0\n\t" - "str r5, [sp, #4]\n\t" - /* A[2] * B[0] = 2 */ - "ldr r10, [%[a], #8]\n\t" - "umull r3, r12, r10, r9\n\t" - "adds r6, r6, r3\n\t" - "adc r4, r4, r12\n\t" - /* A[1] * B[1] = 2 */ - "umull r3, r12, r8, lr\n\t" - "adds r6, r6, r3\n\t" - "mov r5, #0\n\t" - "adcs r4, r4, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[0] * B[2] = 2 */ - "ldr r11, [%[b], #8]\n\t" - "umull r3, r12, r7, r11\n\t" - "adds r6, r6, r3\n\t" - "adcs r4, r4, r12\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #8]\n\t" - /* A[0] * B[3] = 3 */ - "ldr r11, [%[b], #12]\n\t" - "umull r3, r12, r7, r11\n\t" - "adds r4, r4, r3\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[1] * B[2] = 3 */ - "ldr r11, [%[b], #8]\n\t" - "umull r3, r12, r8, r11\n\t" - "adds r4, r4, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[2] * B[1] = 3 */ - "umull r3, r12, r10, lr\n\t" - "adds r4, r4, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[3] * B[0] = 3 */ - "ldr r10, [%[a], #12]\n\t" - "umull r3, r12, r10, r9\n\t" - "adds r4, r4, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - "str r4, [sp, #12]\n\t" - /* A[4] * B[0] = 4 */ - "ldr r10, [%[a], #16]\n\t" - "umull r3, r12, r10, r9\n\t" - "adds r5, r5, r3\n\t" + "sxtb %[b], %[b]\n\t" + "sbfx r3, %[b], #7, #1\n\t" + "eor %[b], %[b], r3\n\t" + "sub %[b], %[b], r3\n\t" + "clz lr, %[b]\n\t" + "lsl lr, lr, #26\n\t" + "asr lr, lr, #31\n\t" + "mvn lr, lr\n\t" + "add %[b], %[b], lr\n\t" + "mov r12, #0x60\n\t" + "mul %[b], %[b], r12\n\t" + "add %[base], %[base], %[b]\n\t" + "ldm %[base]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "and r4, r4, lr\n\t" + "and r5, r5, lr\n\t" + "and r6, r6, lr\n\t" + "and r7, r7, lr\n\t" + "and r8, r8, lr\n\t" + "and r9, r9, lr\n\t" + "and r10, r10, lr\n\t" + "and r11, r11, lr\n\t" + "mvn r12, lr\n\t" + "sub r4, r4, r12\n\t" + "mov r12, #32\n\t" + "and r12, r12, r3\n\t" + "add %[r], %[r], r12\n\t" + "stm %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "sub %[r], %[r], r12\n\t" + "ldm %[base]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "and r4, r4, lr\n\t" + "and r5, r5, lr\n\t" + "and r6, r6, lr\n\t" + "and r7, r7, lr\n\t" + "and r8, r8, lr\n\t" + "and r9, r9, lr\n\t" + "and r10, r10, lr\n\t" + "and r11, r11, lr\n\t" + "mvn r12, lr\n\t" + "sub r4, r4, r12\n\t" + "mov r12, #32\n\t" + "bic r12, r12, r3\n\t" + "add %[r], %[r], r12\n\t" + "stm %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "sub %[r], %[r], r12\n\t" + "add %[r], %[r], #0x40\n\t" + "ldm %[base]!, {r4, r5, r6, r7}\n\t" + "mvn r12, #18\n\t" + "subs r8, r12, r4\n\t" + "sbcs r9, r3, r5\n\t" + "sbcs r10, r3, r6\n\t" + "sbcs r11, r3, r7\n\t" + "bic r4, r4, r3\n\t" + "bic r5, r5, r3\n\t" + "bic r6, r6, r3\n\t" + "bic r7, r7, r3\n\t" + "and r8, r8, r3\n\t" + "and r9, r9, r3\n\t" + "and r10, r10, r3\n\t" + "and r11, r11, r3\n\t" + "orr r4, r4, r8\n\t" + "orr r5, r5, r9\n\t" + "orr r6, r6, r10\n\t" + "orr r7, r7, r11\n\t" + "and r4, r4, lr\n\t" + "and r5, r5, lr\n\t" + "and r6, r6, lr\n\t" + "and r7, r7, lr\n\t" + "stm %[r]!, {r4, r5, r6, r7}\n\t" + "ldm %[base]!, {r4, r5, r6, r7}\n\t" + "mvn r12, #0x80000000\n\t" + "sbcs r8, r3, r4\n\t" + "sbcs r9, r3, r5\n\t" + "sbcs r10, r3, r6\n\t" + "sbc r11, r12, r7\n\t" + "bic r4, r4, r3\n\t" + "bic r5, r5, r3\n\t" + "bic r6, r6, r3\n\t" + "bic r7, r7, r3\n\t" + "and r8, r8, r3\n\t" + "and r9, r9, r3\n\t" + "and r10, r10, r3\n\t" + "and r11, r11, r3\n\t" + "orr r4, r4, r8\n\t" + "orr r5, r5, r9\n\t" + "orr r6, r6, r10\n\t" + "orr r7, r7, r11\n\t" 
+ "and r4, r4, lr\n\t" + "and r5, r5, lr\n\t" + "and r6, r6, lr\n\t" + "and r7, r7, lr\n\t" + "stm %[r]!, {r4, r5, r6, r7}\n\t" + "sub %[base], %[base], %[b]\n\t" + : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +#endif /* WC_NO_CACHE_RESISTANT */ +#endif /* HAVE_ED25519 */ +void fe_mul_op(void); +void fe_mul_op() +{ + + __asm__ __volatile__ ( + "sub sp, sp, #44\n\t" +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "str r0, [sp, #36]\n\t" + "str r1, [sp, #40]\n\t" +#else + "strd r0, r1, [sp, #36]\n\t" +#endif + "mov lr, r2\n\t" + "ldm r1, {r0, r1, r2, r3}\n\t" + "ldm lr!, {r4, r5, r6}\n\t" + "umull r10, r11, r0, r4\n\t" + "umull r12, r7, r1, r4\n\t" + "umaal r11, r12, r0, r5\n\t" + "umull r8, r9, r2, r4\n\t" + "umaal r12, r8, r1, r5\n\t" + "umaal r12, r7, r0, r6\n\t" + "umaal r8, r9, r3, r4\n\t" + "stm sp, {r10, r11, r12}\n\t" + "umaal r7, r8, r2, r5\n\t" + "ldm lr!, {r4}\n\t" + "umull r10, r11, r1, r6\n\t" + "umaal r8, r9, r2, r6\n\t" + "umaal r7, r10, r0, r4\n\t" + "umaal r8, r11, r3, r5\n\t" + "str r7, [sp, #12]\n\t" + "umaal r8, r10, r1, r4\n\t" + "umaal r9, r11, r3, r6\n\t" + "umaal r9, r10, r2, r4\n\t" + "umaal r10, r11, r3, r4\n\t" + "ldm lr, {r4, r5, r6, r7}\n\t" + "mov r12, #0\n\t" + "umlal r8, r12, r0, r4\n\t" + "umaal r9, r12, r1, r4\n\t" + "umaal r10, r12, r2, r4\n\t" + "umaal r11, r12, r3, r4\n\t" "mov r4, #0\n\t" - "adcs r6, r6, r12\n\t" - "adc r4, r4, #0\n\t" - /* A[3] * B[1] = 4 */ - "ldr r10, [%[a], #12]\n\t" - "umull r3, r12, r10, lr\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r4, r4, #0\n\t" - /* A[2] * B[2] = 4 */ - "ldr r10, [%[a], #8]\n\t" - "umull r3, r12, r10, r11\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r4, r4, #0\n\t" - /* A[1] * B[3] = 4 */ - "ldr r11, [%[b], #12]\n\t" - "umull r3, r12, r8, r11\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r4, r4, #0\n\t" - /* A[0] * B[4] = 4 */ - "ldr r11, [%[b], #16]\n\t" - "umull r3, r12, r7, r11\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r4, r4, #0\n\t" - "str r5, [sp, #16]\n\t" - /* A[0] * B[5] = 5 */ - "ldr r11, [%[b], #20]\n\t" - "umull r3, r12, r7, r11\n\t" - "adds r6, r6, r3\n\t" + "umlal r9, r4, r0, r5\n\t" + "umaal r10, r4, r1, r5\n\t" + "umaal r11, r4, r2, r5\n\t" + "umaal r12, r4, r3, r5\n\t" "mov r5, #0\n\t" - "adcs r4, r4, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[1] * B[4] = 5 */ - "ldr r11, [%[b], #16]\n\t" - "umull r3, r12, r8, r11\n\t" - "adds r6, r6, r3\n\t" - "adcs r4, r4, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[2] * B[3] = 5 */ - "ldr r11, [%[b], #12]\n\t" - "umull r3, r12, r10, r11\n\t" - "adds r6, r6, r3\n\t" - "adcs r4, r4, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[3] * B[2] = 5 */ - "ldr r10, [%[a], #12]\n\t" - "ldr r11, [%[b], #8]\n\t" - "umull r3, r12, r10, r11\n\t" - "adds r6, r6, r3\n\t" - "adcs r4, r4, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[4] * B[1] = 5 */ - "ldr r10, [%[a], #16]\n\t" - "umull r3, r12, r10, lr\n\t" - "adds r6, r6, r3\n\t" - "adcs r4, r4, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[5] * B[0] = 5 */ - "ldr r10, [%[a], #20]\n\t" - "umull r3, r12, r10, r9\n\t" - "adds r6, r6, r3\n\t" - "adcs r4, r4, r12\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #20]\n\t" - /* A[6] * B[0] = 6 */ - "ldr r10, [%[a], #24]\n\t" - "umull r3, r12, r10, r9\n\t" - "adds r4, r4, r3\n\t" + "umlal r10, r5, r0, r6\n\t" + "umaal r11, r5, r1, r6\n\t" + "umaal r12, r5, r2, r6\n\t" + "umaal r4, r5, r3, r6\n\t" "mov r6, #0\n\t" - "adcs r5, r5, 
r12\n\t" - "adc r6, r6, #0\n\t" - /* A[5] * B[1] = 6 */ - "ldr r10, [%[a], #20]\n\t" - "umull r3, r12, r10, lr\n\t" - "adds r4, r4, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[4] * B[2] = 6 */ - "ldr r10, [%[a], #16]\n\t" - "umull r3, r12, r10, r11\n\t" - "adds r4, r4, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[3] * B[3] = 6 */ - "ldr r10, [%[a], #12]\n\t" - "ldr r11, [%[b], #12]\n\t" - "umull r3, r12, r10, r11\n\t" - "adds r4, r4, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[2] * B[4] = 6 */ - "ldr r10, [%[a], #8]\n\t" - "ldr r11, [%[b], #16]\n\t" - "umull r3, r12, r10, r11\n\t" - "adds r4, r4, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[1] * B[5] = 6 */ - "ldr r11, [%[b], #20]\n\t" - "umull r3, r12, r8, r11\n\t" - "adds r4, r4, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[0] * B[6] = 6 */ - "ldr r11, [%[b], #24]\n\t" - "umull r3, r12, r7, r11\n\t" - "adds r4, r4, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - "str r4, [sp, #24]\n\t" - /* A[0] * B[7] = 7 */ - "ldr r11, [%[b], #28]\n\t" - "umull r3, r12, r7, r11\n\t" - "adds r5, r5, r3\n\t" - "mov r4, #0\n\t" - "adcs r6, r6, r12\n\t" - "adc r4, r4, #0\n\t" - /* A[1] * B[6] = 7 */ - "ldr r11, [%[b], #24]\n\t" - "umull r3, r12, r8, r11\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r4, r4, #0\n\t" - /* A[2] * B[5] = 7 */ - "ldr r11, [%[b], #20]\n\t" - "umull r3, r12, r10, r11\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r4, r4, #0\n\t" - /* A[3] * B[4] = 7 */ - "ldr r10, [%[a], #12]\n\t" - "ldr r11, [%[b], #16]\n\t" - "umull r3, r12, r10, r11\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r4, r4, #0\n\t" - /* A[4] * B[3] = 7 */ - "ldr r10, [%[a], #16]\n\t" - "ldr r11, [%[b], #12]\n\t" - "umull r3, r12, r10, r11\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r4, r4, #0\n\t" - /* A[5] * B[2] = 7 */ - "ldr r10, [%[a], #20]\n\t" - "ldr r11, [%[b], #8]\n\t" - "umull r3, r12, r10, r11\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r4, r4, #0\n\t" - /* A[6] * B[1] = 7 */ - "ldr r10, [%[a], #24]\n\t" - "umull r3, r12, r10, lr\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r4, r4, #0\n\t" - /* A[7] * B[0] = 7 */ - "ldr r10, [%[a], #28]\n\t" - "umull r3, r12, r10, r9\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r4, r4, #0\n\t" - "str r5, [sp, #28]\n\t" - "ldr r7, [%[a], #24]\n\t" - "ldr r9, [%[b], #24]\n\t" - /* A[7] * B[1] = 8 */ - "umull r3, r12, r10, lr\n\t" - "adds r6, r6, r3\n\t" - "mov r5, #0\n\t" - "adcs r4, r4, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[6] * B[2] = 8 */ - "umull r3, r12, r7, r11\n\t" - "adds r6, r6, r3\n\t" - "adcs r4, r4, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[5] * B[3] = 8 */ - "ldr r10, [%[a], #20]\n\t" - "ldr r11, [%[b], #12]\n\t" - "umull r3, r12, r10, r11\n\t" - "adds r6, r6, r3\n\t" - "adcs r4, r4, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[4] * B[4] = 8 */ - "ldr r10, [%[a], #16]\n\t" - "ldr r11, [%[b], #16]\n\t" - "umull r3, r12, r10, r11\n\t" - "adds r6, r6, r3\n\t" - "adcs r4, r4, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[3] * B[5] = 8 */ - "ldr r10, [%[a], #12]\n\t" - "ldr r11, [%[b], #20]\n\t" - "umull r3, r12, r10, r11\n\t" - "adds r6, r6, r3\n\t" - "adcs r4, r4, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[2] * B[6] = 8 */ - "ldr r10, [%[a], #8]\n\t" - "umull r3, r12, r10, r9\n\t" - "adds r6, r6, r3\n\t" - "adcs r4, r4, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[1] * B[7] = 8 */ - "ldr r11, [%[b], #28]\n\t" - "umull r3, r12, r8, 
r11\n\t" - "adds r6, r6, r3\n\t" - "adcs r4, r4, r12\n\t" - "adc r5, r5, #0\n\t" + "umlal r11, r6, r0, r7\n\t" + "ldr r0, [sp, #40]\n\t" + "umaal r12, r6, r1, r7\n\t" + "add r0, r0, #16\n\t" + "umaal r4, r6, r2, r7\n\t" + "sub lr, lr, #16\n\t" + "umaal r5, r6, r3, r7\n\t" + "ldm r0, {r0, r1, r2, r3}\n\t" "str r6, [sp, #32]\n\t" - "ldr r8, [%[a], #28]\n\t" - "mov lr, r11\n\t" - /* A[2] * B[7] = 9 */ - "umull r3, r12, r10, lr\n\t" - "adds r4, r4, r3\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[3] * B[6] = 9 */ - "ldr r10, [%[a], #12]\n\t" - "umull r3, r12, r10, r9\n\t" - "adds r4, r4, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[4] * B[5] = 9 */ - "ldr r10, [%[a], #16]\n\t" - "ldr r11, [%[b], #20]\n\t" - "umull r3, r12, r10, r11\n\t" - "adds r4, r4, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[5] * B[4] = 9 */ - "ldr r10, [%[a], #20]\n\t" - "ldr r11, [%[b], #16]\n\t" - "umull r3, r12, r10, r11\n\t" - "adds r4, r4, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[6] * B[3] = 9 */ - "ldr r11, [%[b], #12]\n\t" - "umull r3, r12, r7, r11\n\t" - "adds r4, r4, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[7] * B[2] = 9 */ - "ldr r11, [%[b], #8]\n\t" - "umull r3, r12, r8, r11\n\t" - "adds r4, r4, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - "str r4, [sp, #36]\n\t" - /* A[7] * B[3] = 10 */ - "ldr r11, [%[b], #12]\n\t" - "umull r3, r12, r8, r11\n\t" - "adds r5, r5, r3\n\t" - "mov r4, #0\n\t" - "adcs r6, r6, r12\n\t" - "adc r4, r4, #0\n\t" - /* A[6] * B[4] = 10 */ - "ldr r11, [%[b], #16]\n\t" - "umull r3, r12, r7, r11\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r4, r4, #0\n\t" - /* A[5] * B[5] = 10 */ - "ldr r11, [%[b], #20]\n\t" - "umull r3, r12, r10, r11\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r4, r4, #0\n\t" - /* A[4] * B[6] = 10 */ - "ldr r10, [%[a], #16]\n\t" - "umull r3, r12, r10, r9\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r4, r4, #0\n\t" - /* A[3] * B[7] = 10 */ - "ldr r10, [%[a], #12]\n\t" - "umull r3, r12, r10, lr\n\t" - "adds r5, r5, r3\n\t" - "adcs r6, r6, r12\n\t" - "adc r4, r4, #0\n\t" - "str r5, [sp, #40]\n\t" - /* A[4] * B[7] = 11 */ - "ldr r10, [%[a], #16]\n\t" - "umull r3, r12, r10, lr\n\t" - "adds r6, r6, r3\n\t" - "mov r5, #0\n\t" - "adcs r4, r4, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[5] * B[6] = 11 */ - "ldr r10, [%[a], #20]\n\t" - "umull r3, r12, r10, r9\n\t" - "adds r6, r6, r3\n\t" - "adcs r4, r4, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[6] * B[5] = 11 */ - "umull r3, r12, r7, r11\n\t" - "adds r6, r6, r3\n\t" - "adcs r4, r4, r12\n\t" - "adc r5, r5, #0\n\t" - /* A[7] * B[4] = 11 */ - "ldr r11, [%[b], #16]\n\t" - "umull r3, r12, r8, r11\n\t" - "adds r6, r6, r3\n\t" - "adcs r4, r4, r12\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #44]\n\t" - /* A[7] * B[5] = 12 */ - "ldr r11, [%[b], #20]\n\t" - "umull r3, r12, r8, r11\n\t" - "adds r4, r4, r3\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[6] * B[6] = 12 */ - "umull r3, r12, r7, r9\n\t" - "adds r4, r4, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - /* A[5] * B[7] = 12 */ - "umull r3, r12, r10, lr\n\t" - "adds r4, r4, r3\n\t" - "adcs r5, r5, r12\n\t" - "adc r6, r6, #0\n\t" - "str r4, [sp, #48]\n\t" - /* A[6] * B[7] = 13 */ - "umull r3, r12, r7, lr\n\t" - "adds r5, r5, r3\n\t" - "mov r4, #0\n\t" - "adcs r6, r6, r12\n\t" - "adc r4, r4, #0\n\t" - /* A[7] * B[6] = 13 */ - "umull r3, r12, r8, r9\n\t" - "adds r5, r5, r3\n\t" - "adcs 
r6, r6, r12\n\t" - "adc r4, r4, #0\n\t" - "str r5, [sp, #52]\n\t" - /* A[7] * B[7] = 14 */ - "umull r3, r12, r8, lr\n\t" - "adds r6, r6, r3\n\t" - "adc r4, r4, r12\n\t" - "str r6, [sp, #56]\n\t" - "str r4, [sp, #60]\n\t" + "ldm lr!, {r6}\n\t" + "mov r7, #0\n\t" + "umlal r8, r7, r0, r6\n\t" + "umaal r9, r7, r1, r6\n\t" + "str r8, [sp, #16]\n\t" + "umaal r10, r7, r2, r6\n\t" + "umaal r11, r7, r3, r6\n\t" + "ldm lr!, {r6}\n\t" + "mov r8, #0\n\t" + "umlal r9, r8, r0, r6\n\t" + "umaal r10, r8, r1, r6\n\t" + "str r9, [sp, #20]\n\t" + "umaal r11, r8, r2, r6\n\t" + "umaal r12, r8, r3, r6\n\t" + "ldm lr!, {r6}\n\t" + "mov r9, #0\n\t" + "umlal r10, r9, r0, r6\n\t" + "umaal r11, r9, r1, r6\n\t" + "str r10, [sp, #24]\n\t" + "umaal r12, r9, r2, r6\n\t" + "umaal r4, r9, r3, r6\n\t" + "ldm lr!, {r6}\n\t" + "mov r10, #0\n\t" + "umlal r11, r10, r0, r6\n\t" + "umaal r12, r10, r1, r6\n\t" + "str r11, [sp, #28]\n\t" + "umaal r4, r10, r2, r6\n\t" + "umaal r5, r10, r3, r6\n\t" + "ldm lr!, {r11}\n\t" + "umaal r12, r7, r0, r11\n\t" + "umaal r4, r7, r1, r11\n\t" + "ldr r6, [sp, #32]\n\t" + "umaal r5, r7, r2, r11\n\t" + "umaal r6, r7, r3, r11\n\t" + "ldm lr!, {r11}\n\t" + "umaal r4, r8, r0, r11\n\t" + "umaal r5, r8, r1, r11\n\t" + "umaal r6, r8, r2, r11\n\t" + "umaal r7, r8, r3, r11\n\t" + "ldm lr, {r11, lr}\n\t" + "umaal r5, r9, r0, r11\n\t" + "umaal r6, r10, r0, lr\n\t" + "umaal r6, r9, r1, r11\n\t" + "umaal r7, r10, r1, lr\n\t" + "umaal r7, r9, r2, r11\n\t" + "umaal r8, r10, r2, lr\n\t" + "umaal r8, r9, r3, r11\n\t" + "umaal r9, r10, r3, lr\n\t" /* Reduce */ - /* Load bottom half */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp]\n\t" - "ldr r5, [sp, #4]\n\t" -#else - "ldrd r4, r5, [sp]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #8]\n\t" - "ldr r7, [sp, #12]\n\t" -#else - "ldrd r6, r7, [sp, #8]\n\t" -#endif + "ldr r0, [sp, #28]\n\t" + "mov lr, #37\n\t" + "umaal r10, r0, r10, lr\n\t" + "mov lr, #19\n\t" + "lsl r0, r0, #1\n\t" + "orr r0, r0, r10, lsr #31\n\t" + "mul r11, r0, lr\n\t" + "pop {r0-r2}\n\t" + "mov lr, #38\n\t" + "umaal r0, r11, r12, lr\n\t" + "umaal r1, r11, r4, lr\n\t" + "umaal r2, r11, r5, lr\n\t" + "pop {r3-r5}\n\t" + "umaal r3, r11, r6, lr\n\t" + "umaal r4, r11, r7, lr\n\t" + "umaal r5, r11, r8, lr\n\t" + "pop {r6}\n\t" + "bfc r10, #31, #1\n\t" + "umaal r6, r11, r9, lr\n\t" + "add r7, r10, r11\n\t" + "ldr lr, [sp, #8]\n\t" + /* Store */ + "stm lr, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t" + "add sp, sp, #16\n\t" + : + : + : "memory", "lr" + ); +} + +void fe_mul(fe r_p, const fe a_p, const fe b_p) +{ + register fe r asm ("r0") = r_p; + register const fe a asm ("r1") = a_p; + register const fe b asm ("r2") = b_p; + + __asm__ __volatile__ ( + "bl fe_mul_op\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +void fe_sq_op(void); +void fe_sq_op() +{ + + __asm__ __volatile__ ( + "sub sp, sp, #32\n\t" + "str r0, [sp, #28]\n\t" + "ldm r1, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t" + /* Square */ + "umull r9, r10, r0, r0\n\t" + "umull r11, r12, r0, r1\n\t" + "adds r11, r11, r11\n\t" + "mov lr, #0\n\t" + "umaal r10, r11, lr, lr\n\t" + "stm sp, {r9, r10}\n\t" + "mov r8, lr\n\t" + "umaal r8, r12, r0, r2\n\t" + "adcs r8, r8, r8\n\t" + "umaal r8, r11, r1, r1\n\t" + "umull r9, r10, r0, r3\n\t" + "umaal r9, r12, r1, r2\n\t" + "adcs r9, r9, r9\n\t" + "umaal r9, r11, lr, lr\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, 
[sp, #16]\n\t" - "ldr r9, [sp, #20]\n\t" + "str r8, [sp, #8]\n\t" + "str r9, [sp, #12]\n\t" #else - "ldrd r8, r9, [sp, #16]\n\t" + "strd r8, r9, [sp, #8]\n\t" #endif + "mov r9, lr\n\t" + "umaal r9, r10, r0, r4\n\t" + "umaal r9, r12, r1, r3\n\t" + "adcs r9, r9, r9\n\t" + "umaal r9, r11, r2, r2\n\t" + "str r9, [sp, #16]\n\t" + "umull r9, r8, r0, r5\n\t" + "umaal r9, r12, r1, r4\n\t" + "umaal r9, r10, r2, r3\n\t" + "adcs r9, r9, r9\n\t" + "umaal r9, r11, lr, lr\n\t" + "str r9, [sp, #20]\n\t" + "mov r9, lr\n\t" + "umaal r9, r8, r0, r6\n\t" + "umaal r9, r12, r1, r5\n\t" + "umaal r9, r10, r2, r4\n\t" + "adcs r9, r9, r9\n\t" + "umaal r9, r11, r3, r3\n\t" + "str r9, [sp, #24]\n\t" + "umull r0, r9, r0, r7\n\t" + "umaal r0, r8, r1, r6\n\t" + "umaal r0, r12, r2, r5\n\t" + "umaal r0, r10, r3, r4\n\t" + "adcs r0, r0, r0\n\t" + "umaal r0, r11, lr, lr\n\t" + /* R[7] = r0 */ + "umaal r9, r8, r1, r7\n\t" + "umaal r9, r10, r2, r6\n\t" + "umaal r12, r9, r3, r5\n\t" + "adcs r12, r12, r12\n\t" + "umaal r12, r11, r4, r4\n\t" + /* R[8] = r12 */ + "umaal r9, r8, r2, r7\n\t" + "umaal r10, r9, r3, r6\n\t" + "mov r2, lr\n\t" + "umaal r10, r2, r4, r5\n\t" + "adcs r10, r10, r10\n\t" + "umaal r11, r10, lr, lr\n\t" + /* R[9] = r11 */ + "umaal r2, r8, r3, r7\n\t" + "umaal r2, r9, r4, r6\n\t" + "adcs r3, r2, r2\n\t" + "umaal r10, r3, r5, r5\n\t" + /* R[10] = r10 */ + "mov r1, lr\n\t" + "umaal r1, r8, r4, r7\n\t" + "umaal r1, r9, r5, r6\n\t" + "adcs r4, r1, r1\n\t" + "umaal r3, r4, lr, lr\n\t" + /* R[11] = r3 */ + "umaal r8, r9, r5, r7\n\t" + "adcs r8, r8, r8\n\t" + "umaal r4, r8, r6, r6\n\t" + /* R[12] = r4 */ + "mov r5, lr\n\t" + "umaal r5, r9, r6, r7\n\t" + "adcs r5, r5, r5\n\t" + "umaal r8, r5, lr, lr\n\t" + /* R[13] = r8 */ + "adcs r9, r9, r9\n\t" + "umaal r9, r5, r7, r7\n\t" + "adcs r7, r5, lr\n\t" + /* R[14] = r9 */ + /* R[15] = r7 */ + /* Reduce */ + "mov r6, #37\n\t" + "umaal r7, r0, r7, r6\n\t" + "mov r6, #19\n\t" + "lsl r0, r0, #1\n\t" + "orr r0, r0, r7, lsr #31\n\t" + "mul lr, r0, r6\n\t" + "pop {r0-r1}\n\t" + "mov r6, #38\n\t" + "umaal r0, lr, r12, r6\n\t" + "umaal r1, lr, r11, r6\n\t" + "mov r12, r3\n\t" + "mov r11, r4\n\t" + "pop {r2-r4}\n\t" + "umaal r2, lr, r10, r6\n\t" + "umaal r3, lr, r12, r6\n\t" + "umaal r4, lr, r11, r6\n\t" + "mov r12, r6\n\t" + "pop {r5-r6}\n\t" + "umaal r5, lr, r8, r12\n\t" + "bfc r7, #31, #1\n\t" + "umaal r6, lr, r9, r12\n\t" + "add r7, r7, lr\n\t" + "pop {lr}\n\t" + /* Store */ + "stm lr, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t" + : + : + : "memory", "lr" + ); +} + +void fe_sq(fe r_p, const fe a_p) +{ + register fe r asm ("r0") = r_p; + register const fe a asm ("r1") = a_p; + + __asm__ __volatile__ ( + "bl fe_sq_op\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10", "r11" + ); +} + +void fe_mul121666(fe r_p, fe a_p) +{ + register fe r asm ("r0") = r_p; + register fe a asm ("r1") = a_p; + + __asm__ __volatile__ ( + /* Multiply by 121666 */ + "ldm %[a], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [sp, #24]\n\t" - "ldr r11, [sp, #28]\n\t" + "mov lr, #0xdb\n\t" + "lsl lr, lr, #8\n\t" + "add lr, lr, #0x42\n\t" #else - "ldrd r10, r11, [sp, #24]\n\t" + "mov lr, #0xdb42\n\t" #endif - "lsr r3, r11, #31\n\t" - "and r11, r11, #0x7fffffff\n\t" + "movt lr, #1\n\t" + "umull r2, r10, r2, lr\n\t" + "sub r12, lr, #1\n\t" + "umaal r3, r10, r3, r12\n\t" + "umaal r4, r10, r4, r12\n\t" + "umaal r5, r10, r5, r12\n\t" + "umaal r6, r10, r6, r12\n\t" + "umaal r7, r10, r7, 
r12\n\t" + "umaal r8, r10, r8, r12\n\t" "mov lr, #19\n\t" - "ldr %[a], [sp, #32]\n\t" - "orr r3, r3, %[a], lsl #1\n\t" - "umull r3, r12, lr, r3\n\t" - "adds r4, r4, r3\n\t" - "mov %[b], #0\n\t" - "adcs r5, r5, r12\n\t" - "adc %[b], %[b], #0\n\t" - "lsr r3, %[a], #31\n\t" - "ldr %[a], [sp, #36]\n\t" - "orr r3, r3, %[a], lsl #1\n\t" - "umull r3, r12, lr, r3\n\t" - "add r12, r12, %[b]\n\t" - "adds r5, r5, r3\n\t" - "mov %[b], #0\n\t" - "adcs r6, r6, r12\n\t" - "adc %[b], %[b], #0\n\t" - "lsr r3, %[a], #31\n\t" - "ldr %[a], [sp, #40]\n\t" - "orr r3, r3, %[a], lsl #1\n\t" - "umull r3, r12, lr, r3\n\t" - "add r12, r12, %[b]\n\t" - "adds r6, r6, r3\n\t" - "mov %[b], #0\n\t" - "adcs r7, r7, r12\n\t" - "adc %[b], %[b], #0\n\t" - "lsr r3, %[a], #31\n\t" - "ldr %[a], [sp, #44]\n\t" - "orr r3, r3, %[a], lsl #1\n\t" - "umull r3, r12, lr, r3\n\t" - "add r12, r12, %[b]\n\t" - "adds r7, r7, r3\n\t" - "mov %[b], #0\n\t" - "adcs r8, r8, r12\n\t" - "adc %[b], %[b], #0\n\t" - "lsr r3, %[a], #31\n\t" - "ldr %[a], [sp, #48]\n\t" - "orr r3, r3, %[a], lsl #1\n\t" - "umull r3, r12, lr, r3\n\t" - "add r12, r12, %[b]\n\t" - "adds r8, r8, r3\n\t" - "mov %[b], #0\n\t" - "adcs r9, r9, r12\n\t" - "adc %[b], %[b], #0\n\t" - "lsr r3, %[a], #31\n\t" - "ldr %[a], [sp, #52]\n\t" - "orr r3, r3, %[a], lsl #1\n\t" - "umull r3, r12, lr, r3\n\t" - "add r12, r12, %[b]\n\t" - "adds r9, r9, r3\n\t" - "mov %[b], #0\n\t" - "adcs r10, r10, r12\n\t" - "adc %[b], %[b], #0\n\t" - "lsr r3, %[a], #31\n\t" - "ldr %[a], [sp, #56]\n\t" - "orr r3, r3, %[a], lsl #1\n\t" - "umull r3, r12, lr, r3\n\t" - "add r12, r12, %[b]\n\t" - "adds r10, r10, r3\n\t" - "mov %[b], #0\n\t" - "adcs r11, r11, r12\n\t" - "adc %[b], %[b], #0\n\t" - "lsr r3, %[a], #31\n\t" - "ldr %[a], [sp, #60]\n\t" - "orr r3, r3, %[a], lsl #1\n\t" - "umull r3, r12, lr, r3\n\t" - "adds r11, r11, r3\n\t" - "adc r3, r12, %[b]\n\t" - /* Overflow */ - "lsl r3, r3, #1\n\t" - "orr r3, r3, r11, lsr #31\n\t" - "mul r3, r3, lr\n\t" - "and r11, r11, #0x7fffffff\n\t" - "adds r4, r4, r3\n\t" - "adcs r5, r5, #0\n\t" - "adcs r6, r6, #0\n\t" - "adcs r7, r7, #0\n\t" - "adcs r8, r8, #0\n\t" - "adcs r9, r9, #0\n\t" - "adcs r10, r10, #0\n\t" - "adc r11, r11, #0\n\t" - /* Reduce if top bit set */ - "asr r3, r11, #31\n\t" - "and r3, r3, lr\n\t" - "and r11, r11, #0x7fffffff\n\t" - "adds r4, r4, r3\n\t" + "umaal r9, r10, r9, r12\n\t" + "lsl r10, r10, #1\n\t" + "orr r10, r10, r9, lsr #31\n\t" + "mul r10, r10, lr\n\t" + "adds r2, r2, r10\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" "adcs r5, r5, #0\n\t" "adcs r6, r6, #0\n\t" "adcs r7, r7, #0\n\t" + "bfc r9, #31, #1\n\t" "adcs r8, r8, #0\n\t" - "adcs r9, r9, #0\n\t" - "adcs r10, r10, #0\n\t" - "adc r11, r11, #0\n\t" - /* Store */ + "adc r9, r9, #0\n\t" + "stm %[r], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10" + ); +} + +#ifndef WC_NO_CACHE_RESISTANT +int curve25519(byte* r_p, const byte* n_p, const byte* a_p) +{ + register byte* r asm ("r0") = r_p; + register const byte* n asm ("r1") = n_p; + register const byte* a asm ("r2") = a_p; + + __asm__ __volatile__ ( + "sub sp, sp, #0xbc\n\t" + "str %[r], [sp, #160]\n\t" + "str %[n], [sp, #164]\n\t" + "str %[a], [sp, #168]\n\t" + "mov %[n], #0\n\t" + "str %[n], [sp, #172]\n\t" + /* Set one */ + "mov r10, #1\n\t" + "mov r11, #0\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [%[r]]\n\t" - "str r5, [%[r], #4]\n\t" + "str r10, [%[r]]\n\t" + "str r11, [%[r], #4]\n\t" #else - 
"strd r4, r5, [%[r]]\n\t" + "strd r10, r11, [%[r]]\n\t" #endif + "mov r10, #0\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [%[r], #8]\n\t" - "str r7, [%[r], #12]\n\t" + "str r10, [%[r], #8]\n\t" + "str r11, [%[r], #12]\n\t" #else - "strd r6, r7, [%[r], #8]\n\t" + "strd r10, r11, [%[r], #8]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [%[r], #16]\n\t" - "str r9, [%[r], #20]\n\t" + "str r10, [%[r], #16]\n\t" + "str r11, [%[r], #20]\n\t" #else - "strd r8, r9, [%[r], #16]\n\t" + "strd r10, r11, [%[r], #16]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "str r10, [%[r], #24]\n\t" @@ -3045,552 +2847,137 @@ void fe_mul(fe r_p, const fe a_p, const fe b_p) #else "strd r10, r11, [%[r], #24]\n\t" #endif - "add sp, sp, #0x40\n\t" - : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" - ); -} - -void fe_sq(fe r_p, const fe a_p) -{ - register fe r asm ("r0") = r_p; - register const fe a asm ("r1") = a_p; - - __asm__ __volatile__ ( - "sub sp, sp, #0x40\n\t" - /* Square */ - "ldr r7, [%[a]]\n\t" - "ldr r8, [%[a], #4]\n\t" - "ldr r9, [%[a], #8]\n\t" - "ldr r10, [%[a], #12]\n\t" - "ldr r12, [%[a], #16]\n\t" - /* A[0] * A[0] = 0 */ - "umull r4, r5, r7, r7\n\t" - "str r4, [sp]\n\t" - /* A[0] * A[1] = 1 */ - "umull r2, r3, r7, r8\n\t" - "mov r6, #0\n\t" - "adds r5, r5, r2\n\t" - "adc r6, r6, r3\n\t" - "adds r5, r5, r2\n\t" - "mov r4, #0\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "str r5, [sp, #4]\n\t" - /* A[1] * A[1] = 2 */ - "umull r2, r3, r8, r8\n\t" - "adds r6, r6, r2\n\t" - "adc r4, r4, r3\n\t" - /* A[0] * A[2] = 2 */ - "umull r2, r3, r7, r9\n\t" - "adds r6, r6, r2\n\t" - "mov r5, #0\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #8]\n\t" - /* A[0] * A[3] = 3 */ - "umull r2, r3, r7, r10\n\t" - "adds r4, r4, r2\n\t" - "adc r5, r5, r3\n\t" - "adds r4, r4, r2\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[1] * A[2] = 3 */ - "umull r2, r3, r8, r9\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "str r4, [sp, #12]\n\t" - /* A[2] * A[2] = 4 */ - "umull r2, r3, r9, r9\n\t" - "adds r5, r5, r2\n\t" - "mov r4, #0\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - /* A[1] * A[3] = 4 */ - "umull r2, r3, r8, r10\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - /* A[0] * A[4] = 4 */ - "umull r2, r3, r7, r12\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "str r5, [sp, #16]\n\t" - /* A[0] * A[5] = 5 */ - "ldr r11, [%[a], #20]\n\t" - "umull r2, r3, r7, r11\n\t" - "adds r6, r6, r2\n\t" - "mov r5, #0\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[1] * A[4] = 5 */ - "umull r2, r3, r8, r12\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[2] * A[3] = 5 */ - "umull r2, r3, r9, r10\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, 
#20]\n\t" - /* A[3] * A[3] = 6 */ - "umull r2, r3, r10, r10\n\t" - "adds r4, r4, r2\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[2] * A[4] = 6 */ - "umull r2, r3, r9, r12\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[1] * A[5] = 6 */ - "umull r2, r3, r8, r11\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[0] * A[6] = 6 */ - "ldr r11, [%[a], #24]\n\t" - "umull r2, r3, r7, r11\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "str r4, [sp, #24]\n\t" - /* A[0] * A[7] = 7 */ - "ldr r11, [%[a], #28]\n\t" - "umull r2, r3, r7, r11\n\t" - "adds r5, r5, r2\n\t" - "mov r4, #0\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - /* A[1] * A[6] = 7 */ - "ldr r11, [%[a], #24]\n\t" - "umull r2, r3, r8, r11\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - /* A[2] * A[5] = 7 */ - "ldr r11, [%[a], #20]\n\t" - "umull r2, r3, r9, r11\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - /* A[3] * A[4] = 7 */ - "umull r2, r3, r10, r12\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "str r5, [sp, #28]\n\t" - /* A[4] * A[4] = 8 */ - "umull r2, r3, r12, r12\n\t" - "adds r6, r6, r2\n\t" - "mov r5, #0\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[3] * A[5] = 8 */ - "umull r2, r3, r10, r11\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[2] * A[6] = 8 */ - "ldr r11, [%[a], #24]\n\t" - "umull r2, r3, r9, r11\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[1] * A[7] = 8 */ - "ldr r11, [%[a], #28]\n\t" - "umull r2, r3, r8, r11\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #32]\n\t" - "ldr r7, [%[a], #20]\n\t" - /* A[2] * A[7] = 9 */ - "umull r2, r3, r9, r11\n\t" - "adds r4, r4, r2\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[3] * A[6] = 9 */ - "ldr r11, [%[a], #24]\n\t" - "umull r2, r3, r10, r11\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[4] * A[5] = 9 */ - "umull r2, r3, r12, r7\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "str r4, [sp, #36]\n\t" - "mov r8, r11\n\t" - /* A[5] * A[5] = 10 */ - "umull r2, r3, r7, r7\n\t" - "adds r5, r5, r2\n\t" - "mov r4, #0\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - /* A[4] * A[6] = 10 */ - "umull r2, r3, r12, r8\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, 
r3\n\t" - "adc r4, r4, #0\n\t" - /* A[3] * A[7] = 10 */ - "ldr r11, [%[a], #28]\n\t" - "umull r2, r3, r10, r11\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "str r5, [sp, #40]\n\t" - "mov r9, r11\n\t" - /* A[4] * A[7] = 11 */ - "umull r2, r3, r12, r9\n\t" - "adds r6, r6, r2\n\t" - "mov r5, #0\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[5] * A[6] = 11 */ - "umull r2, r3, r7, r8\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #44]\n\t" - /* A[6] * A[6] = 12 */ - "umull r2, r3, r8, r8\n\t" - "adds r4, r4, r2\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[5] * A[7] = 12 */ - "umull r2, r3, r7, r9\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "str r4, [sp, #48]\n\t" - /* A[6] * A[7] = 13 */ - "umull r2, r3, r8, r9\n\t" - "adds r5, r5, r2\n\t" - "mov r4, #0\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "str r5, [sp, #52]\n\t" - /* A[7] * A[7] = 14 */ - "umull r2, r3, r9, r9\n\t" - "adds r6, r6, r2\n\t" - "adc r4, r4, r3\n\t" - "str r6, [sp, #56]\n\t" - "str r4, [sp, #60]\n\t" - /* Reduce */ - /* Load bottom half */ + /* Set zero */ + "mov r10, #0\n\t" + "mov r11, #0\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp]\n\t" - "ldr r5, [sp, #4]\n\t" + "str r10, [sp]\n\t" + "str r11, [sp, #4]\n\t" #else - "ldrd r4, r5, [sp]\n\t" + "strd r10, r11, [sp]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #8]\n\t" - "ldr r7, [sp, #12]\n\t" + "str r10, [sp, #8]\n\t" + "str r11, [sp, #12]\n\t" #else - "ldrd r6, r7, [sp, #8]\n\t" + "strd r10, r11, [sp, #8]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [sp, #16]\n\t" - "ldr r9, [sp, #20]\n\t" + "str r10, [sp, #16]\n\t" + "str r11, [sp, #20]\n\t" #else - "ldrd r8, r9, [sp, #16]\n\t" + "strd r10, r11, [sp, #16]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [sp, #24]\n\t" - "ldr r11, [sp, #28]\n\t" + "str r10, [sp, #24]\n\t" + "str r11, [sp, #28]\n\t" #else - "ldrd r10, r11, [sp, #24]\n\t" + "strd r10, r11, [sp, #24]\n\t" #endif - "lsr r2, r11, #31\n\t" - "and r11, r11, #0x7fffffff\n\t" - "mov r12, #19\n\t" - "ldr %[a], [sp, #32]\n\t" - "orr r2, r2, %[a], lsl #1\n\t" - "umull r2, r3, r12, r2\n\t" - "adds r4, r4, r2\n\t" - "mov lr, #0\n\t" - "adcs r5, r5, r3\n\t" - "adc lr, lr, #0\n\t" - "lsr r2, %[a], #31\n\t" - "ldr %[a], [sp, #36]\n\t" - "orr r2, r2, %[a], lsl #1\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, lr\n\t" - "adds r5, r5, r2\n\t" - "mov lr, #0\n\t" - "adcs r6, r6, r3\n\t" - "adc lr, lr, #0\n\t" - "lsr r2, %[a], #31\n\t" - "ldr %[a], [sp, #40]\n\t" - "orr r2, r2, %[a], lsl #1\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, lr\n\t" - "adds r6, r6, r2\n\t" - "mov lr, #0\n\t" - "adcs r7, r7, r3\n\t" - "adc lr, lr, #0\n\t" - "lsr r2, %[a], #31\n\t" - "ldr %[a], [sp, #44]\n\t" - "orr r2, r2, %[a], lsl #1\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, lr\n\t" - "adds r7, r7, r2\n\t" - "mov lr, #0\n\t" - "adcs r8, r8, r3\n\t" - "adc lr, lr, #0\n\t" - "lsr r2, %[a], #31\n\t" - "ldr %[a], [sp, 
#48]\n\t" - "orr r2, r2, %[a], lsl #1\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, lr\n\t" - "adds r8, r8, r2\n\t" - "mov lr, #0\n\t" - "adcs r9, r9, r3\n\t" - "adc lr, lr, #0\n\t" - "lsr r2, %[a], #31\n\t" - "ldr %[a], [sp, #52]\n\t" - "orr r2, r2, %[a], lsl #1\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, lr\n\t" - "adds r9, r9, r2\n\t" - "mov lr, #0\n\t" - "adcs r10, r10, r3\n\t" - "adc lr, lr, #0\n\t" - "lsr r2, %[a], #31\n\t" - "ldr %[a], [sp, #56]\n\t" - "orr r2, r2, %[a], lsl #1\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, lr\n\t" - "adds r10, r10, r2\n\t" - "mov lr, #0\n\t" - "adcs r11, r11, r3\n\t" - "adc lr, lr, #0\n\t" - "lsr r2, %[a], #31\n\t" - "ldr %[a], [sp, #60]\n\t" - "orr r2, r2, %[a], lsl #1\n\t" - "umull r2, r3, r12, r2\n\t" - "adds r11, r11, r2\n\t" - "adc r2, r3, lr\n\t" - /* Overflow */ - "lsl r2, r2, #1\n\t" - "orr r2, r2, r11, lsr #31\n\t" - "mul r2, r2, r12\n\t" - "and r11, r11, #0x7fffffff\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, #0\n\t" - "adcs r6, r6, #0\n\t" - "adcs r7, r7, #0\n\t" - "adcs r8, r8, #0\n\t" - "adcs r9, r9, #0\n\t" - "adcs r10, r10, #0\n\t" - "adc r11, r11, #0\n\t" - /* Reduce if top bit set */ - "asr r2, r11, #31\n\t" - "and r2, r2, r12\n\t" - "and r11, r11, #0x7fffffff\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, #0\n\t" - "adcs r6, r6, #0\n\t" - "adcs r7, r7, #0\n\t" - "adcs r8, r8, #0\n\t" - "adcs r9, r9, #0\n\t" - "adcs r10, r10, #0\n\t" - "adc r11, r11, #0\n\t" - /* Store */ + /* Set one */ + "mov r10, #1\n\t" + "mov r11, #0\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [%[r]]\n\t" - "str r5, [%[r], #4]\n\t" + "str r10, [sp, #32]\n\t" + "str r11, [sp, #36]\n\t" #else - "strd r4, r5, [%[r]]\n\t" + "strd r10, r11, [sp, #32]\n\t" #endif + "mov r10, #0\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [%[r], #8]\n\t" - "str r7, [%[r], #12]\n\t" + "str r10, [sp, #40]\n\t" + "str r11, [sp, #44]\n\t" #else - "strd r6, r7, [%[r], #8]\n\t" + "strd r10, r11, [sp, #40]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [%[r], #16]\n\t" - "str r9, [%[r], #20]\n\t" + "str r10, [sp, #48]\n\t" + "str r11, [sp, #52]\n\t" #else - "strd r8, r9, [%[r], #16]\n\t" + "strd r10, r11, [sp, #48]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [%[r], #24]\n\t" - "str r11, [%[r], #28]\n\t" + "str r10, [sp, #56]\n\t" + "str r11, [sp, #60]\n\t" #else - "strd r10, r11, [%[r], #24]\n\t" + "strd r10, r11, [sp, #56]\n\t" #endif - "add sp, sp, #0x40\n\t" - : [r] "+r" (r), [a] "+r" (a) - : - : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" - ); -} - -void fe_mul121666(fe r_p, fe a_p) -{ - register fe r asm ("r0") = r_p; - register fe a asm ("r1") = a_p; - - __asm__ __volatile__ ( - /* Multiply by 121666 */ + "add r3, sp, #0x40\n\t" + /* Copy */ + "ldm r2, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "stm r3, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "mov %[n], #30\n\t" + "str %[n], [sp, #180]\n\t" + "mov %[a], #28\n\t" + "str %[a], [sp, #176]\n\t" + "\n" + "L_curve25519_words_%=: \n\t" + "\n" + "L_curve25519_bits_%=: \n\t" + "ldr %[n], [sp, #164]\n\t" + "ldr %[a], [%[n], r2]\n\t" + "ldr %[n], [sp, #180]\n\t" + "lsr %[a], %[a], %[n]\n\t" + "and %[a], %[a], #1\n\t" + "str %[a], [sp, #184]\n\t" + "ldr %[n], [sp, #172]\n\t" + "eor %[n], %[n], %[a]\n\t" + "str %[n], [sp, #172]\n\t" + "ldr %[r], [sp, #160]\n\t" + /* Conditional Swap */ + "rsb %[n], %[n], #0\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" + "ldr r4, [%[r]]\n\t" + "ldr r5, [%[r], #4]\n\t" #else - "ldrd r2, r3, [%[a]]\n\t" + "ldrd r4, r5, [%[r]]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[a], #8]\n\t" - "ldr r5, [%[a], #12]\n\t" + "ldr r6, [sp, #64]\n\t" + "ldr r7, [sp, #68]\n\t" #else - "ldrd r4, r5, [%[a], #8]\n\t" + "ldrd r6, r7, [sp, #64]\n\t" #endif + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [%[a], #16]\n\t" - "ldr r7, [%[a], #20]\n\t" + "str r4, [%[r]]\n\t" + "str r5, [%[r], #4]\n\t" #else - "ldrd r6, r7, [%[a], #16]\n\t" + "strd r4, r5, [%[r]]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [%[a], #24]\n\t" - "ldr r9, [%[a], #28]\n\t" + "str r6, [sp, #64]\n\t" + "str r7, [sp, #68]\n\t" #else - "ldrd r8, r9, [%[a], #24]\n\t" + "strd r6, r7, [sp, #64]\n\t" #endif - "movw lr, #0xdb42\n\t" - "movt lr, #1\n\t" - "umull r2, r10, r2, lr\n\t" - "umull r3, r12, r3, lr\n\t" - "adds r3, r3, r10\n\t" - "adc r10, r12, #0\n\t" - "umull r4, r12, r4, lr\n\t" - "adds r4, r4, r10\n\t" - "adc r10, r12, #0\n\t" - "umull r5, r12, r5, lr\n\t" - "adds r5, r5, r10\n\t" - "adc r10, r12, #0\n\t" - "umull r6, r12, r6, lr\n\t" - "adds r6, r6, r10\n\t" - "adc r10, r12, #0\n\t" - "umull r7, r12, r7, lr\n\t" - "adds r7, r7, r10\n\t" - "adc r10, r12, #0\n\t" - "umull r8, r12, r8, lr\n\t" - "adds r8, r8, r10\n\t" - "adc r10, r12, #0\n\t" - "umull r9, r12, r9, lr\n\t" - "adds r9, r9, r10\n\t" - "adc r10, r12, #0\n\t" - "mov lr, #19\n\t" - "lsl r10, r10, #1\n\t" - "orr r10, r10, r9, lsr #31\n\t" - "mul r10, r10, lr\n\t" - "and r9, r9, #0x7fffffff\n\t" - "adds r2, r2, r10\n\t" - "adcs r3, r3, #0\n\t" - "adcs r4, r4, #0\n\t" - "adcs r5, r5, #0\n\t" - "adcs r6, r6, #0\n\t" - "adcs r7, r7, #0\n\t" - "adcs r8, r8, #0\n\t" - "adc r9, r9, #0\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r2, [%[r]]\n\t" - "str r3, [%[r], #4]\n\t" + "ldr r4, [%[r], #8]\n\t" + "ldr r5, [%[r], #12]\n\t" #else - "strd r2, r3, [%[r]]\n\t" + "ldrd r4, r5, [%[r], #8]\n\t" #endif +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "ldr r6, [sp, #72]\n\t" + "ldr r7, [sp, #76]\n\t" +#else + "ldrd r6, r7, [sp, #72]\n\t" +#endif + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "str r4, [%[r], #8]\n\t" "str r5, [%[r], #12]\n\t" @@ -3598,937 +2985,153 @@ void fe_mul121666(fe r_p, fe a_p) "strd r4, r5, [%[r], #8]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [%[r], #16]\n\t" - "str r7, [%[r], #20]\n\t" + "str r6, [sp, #72]\n\t" + "str r7, [sp, #76]\n\t" #else - "strd r6, r7, [%[r], #16]\n\t" + "strd r6, r7, [sp, #72]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [%[r], #24]\n\t" - "str r9, [%[r], #28]\n\t" + "ldr r4, [%[r], #16]\n\t" + "ldr r5, [%[r], #20]\n\t" #else - "strd r8, r9, [%[r], #24]\n\t" + "ldrd r4, r5, [%[r], #16]\n\t" #endif - : [r] "+r" (r), [a] "+r" (a) - : - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10" - ); -} - -void fe_sq2(fe r_p, const fe a_p) -{ - register 
fe r asm ("r0") = r_p; - register const fe a asm ("r1") = a_p; - - __asm__ __volatile__ ( - "sub sp, sp, #0x40\n\t" - /* Square * 2 */ - "ldr r7, [%[a]]\n\t" - "ldr r8, [%[a], #4]\n\t" - "ldr r9, [%[a], #8]\n\t" - "ldr r10, [%[a], #12]\n\t" - "ldr r12, [%[a], #16]\n\t" - /* A[0] * A[0] = 0 */ - "umull r4, r5, r7, r7\n\t" - "str r4, [sp]\n\t" - /* A[0] * A[1] = 1 */ - "umull r2, r3, r7, r8\n\t" - "mov r6, #0\n\t" - "adds r5, r5, r2\n\t" - "adc r6, r6, r3\n\t" - "adds r5, r5, r2\n\t" - "mov r4, #0\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "str r5, [sp, #4]\n\t" - /* A[1] * A[1] = 2 */ - "umull r2, r3, r8, r8\n\t" - "adds r6, r6, r2\n\t" - "adc r4, r4, r3\n\t" - /* A[0] * A[2] = 2 */ - "umull r2, r3, r7, r9\n\t" - "adds r6, r6, r2\n\t" - "mov r5, #0\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #8]\n\t" - /* A[0] * A[3] = 3 */ - "umull r2, r3, r7, r10\n\t" - "adds r4, r4, r2\n\t" - "adc r5, r5, r3\n\t" - "adds r4, r4, r2\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[1] * A[2] = 3 */ - "umull r2, r3, r8, r9\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "str r4, [sp, #12]\n\t" - /* A[2] * A[2] = 4 */ - "umull r2, r3, r9, r9\n\t" - "adds r5, r5, r2\n\t" - "mov r4, #0\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - /* A[1] * A[3] = 4 */ - "umull r2, r3, r8, r10\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - /* A[0] * A[4] = 4 */ - "umull r2, r3, r7, r12\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "str r5, [sp, #16]\n\t" - /* A[0] * A[5] = 5 */ - "ldr r11, [%[a], #20]\n\t" - "umull r2, r3, r7, r11\n\t" - "adds r6, r6, r2\n\t" - "mov r5, #0\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[1] * A[4] = 5 */ - "umull r2, r3, r8, r12\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[2] * A[3] = 5 */ - "umull r2, r3, r9, r10\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #20]\n\t" - /* A[3] * A[3] = 6 */ - "umull r2, r3, r10, r10\n\t" - "adds r4, r4, r2\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[2] * A[4] = 6 */ - "umull r2, r3, r9, r12\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[1] * A[5] = 6 */ - "umull r2, r3, r8, r11\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[0] * A[6] = 6 */ - "ldr r11, [%[a], #24]\n\t" - "umull r2, r3, r7, r11\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "str r4, [sp, #24]\n\t" - /* A[0] * A[7] = 7 */ - "ldr r11, [%[a], #28]\n\t" - "umull r2, r3, r7, r11\n\t" - "adds r5, r5, r2\n\t" - "mov r4, #0\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, 
r3\n\t" - "adc r4, r4, #0\n\t" - /* A[1] * A[6] = 7 */ - "ldr r11, [%[a], #24]\n\t" - "umull r2, r3, r8, r11\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - /* A[2] * A[5] = 7 */ - "ldr r11, [%[a], #20]\n\t" - "umull r2, r3, r9, r11\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - /* A[3] * A[4] = 7 */ - "umull r2, r3, r10, r12\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "str r5, [sp, #28]\n\t" - /* A[4] * A[4] = 8 */ - "umull r2, r3, r12, r12\n\t" - "adds r6, r6, r2\n\t" - "mov r5, #0\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[3] * A[5] = 8 */ - "umull r2, r3, r10, r11\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[2] * A[6] = 8 */ - "ldr r11, [%[a], #24]\n\t" - "umull r2, r3, r9, r11\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[1] * A[7] = 8 */ - "ldr r11, [%[a], #28]\n\t" - "umull r2, r3, r8, r11\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #32]\n\t" - "ldr r7, [%[a], #20]\n\t" - /* A[2] * A[7] = 9 */ - "umull r2, r3, r9, r11\n\t" - "adds r4, r4, r2\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[3] * A[6] = 9 */ - "ldr r11, [%[a], #24]\n\t" - "umull r2, r3, r10, r11\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[4] * A[5] = 9 */ - "umull r2, r3, r12, r7\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "str r4, [sp, #36]\n\t" - "mov r8, r11\n\t" - /* A[5] * A[5] = 10 */ - "umull r2, r3, r7, r7\n\t" - "adds r5, r5, r2\n\t" - "mov r4, #0\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - /* A[4] * A[6] = 10 */ - "umull r2, r3, r12, r8\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - /* A[3] * A[7] = 10 */ - "ldr r11, [%[a], #28]\n\t" - "umull r2, r3, r10, r11\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "str r5, [sp, #40]\n\t" - "mov r9, r11\n\t" - /* A[4] * A[7] = 11 */ - "umull r2, r3, r12, r9\n\t" - "adds r6, r6, r2\n\t" - "mov r5, #0\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - /* A[5] * A[6] = 11 */ - "umull r2, r3, r7, r8\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "adds r6, r6, r2\n\t" - "adcs r4, r4, r3\n\t" - "adc r5, r5, #0\n\t" - "str r6, [sp, #44]\n\t" - /* A[6] * A[6] = 12 */ - "umull r2, r3, r8, r8\n\t" - "adds r4, r4, r2\n\t" - "mov r6, #0\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - /* A[5] * A[7] = 12 */ - "umull r2, r3, r7, r9\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" 
- "adds r4, r4, r2\n\t" - "adcs r5, r5, r3\n\t" - "adc r6, r6, #0\n\t" - "str r4, [sp, #48]\n\t" - /* A[6] * A[7] = 13 */ - "umull r2, r3, r8, r9\n\t" - "adds r5, r5, r2\n\t" - "mov r4, #0\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r2\n\t" - "adcs r6, r6, r3\n\t" - "adc r4, r4, #0\n\t" - "str r5, [sp, #52]\n\t" - /* A[7] * A[7] = 14 */ - "umull r2, r3, r9, r9\n\t" - "adds r6, r6, r2\n\t" - "adc r4, r4, r3\n\t" - "str r6, [sp, #56]\n\t" - "str r4, [sp, #60]\n\t" - /* Double and Reduce */ - /* Load bottom half */ #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp]\n\t" - "ldr r5, [sp, #4]\n\t" + "ldr r6, [sp, #80]\n\t" + "ldr r7, [sp, #84]\n\t" #else - "ldrd r4, r5, [sp]\n\t" + "ldrd r6, r7, [sp, #80]\n\t" #endif + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #8]\n\t" - "ldr r7, [sp, #12]\n\t" + "str r4, [%[r], #16]\n\t" + "str r5, [%[r], #20]\n\t" #else - "ldrd r6, r7, [sp, #8]\n\t" + "strd r4, r5, [%[r], #16]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [sp, #16]\n\t" - "ldr r9, [sp, #20]\n\t" + "str r6, [sp, #80]\n\t" + "str r7, [sp, #84]\n\t" #else - "ldrd r8, r9, [sp, #16]\n\t" + "strd r6, r7, [sp, #80]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [sp, #24]\n\t" - "ldr r11, [sp, #28]\n\t" + "ldr r4, [%[r], #24]\n\t" + "ldr r5, [%[r], #28]\n\t" #else - "ldrd r10, r11, [sp, #24]\n\t" + "ldrd r4, r5, [%[r], #24]\n\t" #endif - "lsr r2, r11, #30\n\t" - "lsl r11, r11, #1\n\t" - "orr r11, r11, r10, lsr #31\n\t" - "lsl r10, r10, #1\n\t" - "orr r10, r10, r9, lsr #31\n\t" - "lsl r9, r9, #1\n\t" - "orr r9, r9, r8, lsr #31\n\t" - "lsl r8, r8, #1\n\t" - "orr r8, r8, r7, lsr #31\n\t" - "lsl r7, r7, #1\n\t" - "orr r7, r7, r6, lsr #31\n\t" - "lsl r6, r6, #1\n\t" - "orr r6, r6, r5, lsr #31\n\t" - "lsl r5, r5, #1\n\t" - "orr r5, r5, r4, lsr #31\n\t" - "lsl r4, r4, #1\n\t" - "and r11, r11, #0x7fffffff\n\t" - "mov r12, #19\n\t" - "ldr %[a], [sp, #32]\n\t" - "orr r2, r2, %[a], lsl #2\n\t" - "umull r2, r3, r12, r2\n\t" - "adds r4, r4, r2\n\t" - "mov lr, #0\n\t" - "adcs r5, r5, r3\n\t" - "adc lr, lr, #0\n\t" - "lsr r2, %[a], #30\n\t" - "ldr %[a], [sp, #36]\n\t" - "orr r2, r2, %[a], lsl #2\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, lr\n\t" - "adds r5, r5, r2\n\t" - "mov lr, #0\n\t" - "adcs r6, r6, r3\n\t" - "adc lr, lr, #0\n\t" - "lsr r2, %[a], #30\n\t" - "ldr %[a], [sp, #40]\n\t" - "orr r2, r2, %[a], lsl #2\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, lr\n\t" - "adds r6, r6, r2\n\t" - "mov lr, #0\n\t" - "adcs r7, r7, r3\n\t" - "adc lr, lr, #0\n\t" - "lsr r2, %[a], #30\n\t" - "ldr %[a], [sp, #44]\n\t" - "orr r2, r2, %[a], lsl #2\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, lr\n\t" - "adds r7, r7, r2\n\t" - "mov lr, #0\n\t" - "adcs r8, r8, r3\n\t" - "adc lr, lr, #0\n\t" - "lsr r2, %[a], #30\n\t" - "ldr %[a], [sp, #48]\n\t" - "orr r2, r2, %[a], lsl #2\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, lr\n\t" - "adds r8, r8, r2\n\t" - "mov lr, #0\n\t" - "adcs r9, r9, r3\n\t" - "adc lr, lr, #0\n\t" - "lsr r2, %[a], #30\n\t" - "ldr %[a], [sp, #52]\n\t" - "orr r2, r2, %[a], lsl #2\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, lr\n\t" - "adds r9, r9, r2\n\t" - "mov lr, #0\n\t" - "adcs r10, r10, r3\n\t" - "adc lr, lr, #0\n\t" - "lsr r2, %[a], #30\n\t" - 
"ldr %[a], [sp, #56]\n\t" - "orr r2, r2, %[a], lsl #2\n\t" - "umull r2, r3, r12, r2\n\t" - "add r3, r3, lr\n\t" - "adds r10, r10, r2\n\t" - "mov lr, #0\n\t" - "adcs r11, r11, r3\n\t" - "adc lr, lr, #0\n\t" - "lsr r2, %[a], #30\n\t" - "ldr %[a], [sp, #60]\n\t" - "orr r2, r2, %[a], lsl #2\n\t" - "umull r2, r3, r12, r2\n\t" - "adds r11, r11, r2\n\t" - "adc r2, r3, lr\n\t" - /* Overflow */ - "lsl r2, r2, #1\n\t" - "orr r2, r2, r11, lsr #31\n\t" - "mul r2, r2, r12\n\t" - "and r11, r11, #0x7fffffff\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, #0\n\t" - "adcs r6, r6, #0\n\t" - "adcs r7, r7, #0\n\t" - "adcs r8, r8, #0\n\t" - "adcs r9, r9, #0\n\t" - "adcs r10, r10, #0\n\t" - "adc r11, r11, #0\n\t" - /* Reduce if top bit set */ - "asr r2, r11, #31\n\t" - "and r2, r2, r12\n\t" - "and r11, r11, #0x7fffffff\n\t" - "adds r4, r4, r2\n\t" - "adcs r5, r5, #0\n\t" - "adcs r6, r6, #0\n\t" - "adcs r7, r7, #0\n\t" - "adcs r8, r8, #0\n\t" - "adcs r9, r9, #0\n\t" - "adcs r10, r10, #0\n\t" - "adc r11, r11, #0\n\t" - /* Store */ #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [%[r]]\n\t" - "str r5, [%[r], #4]\n\t" + "ldr r6, [sp, #88]\n\t" + "ldr r7, [sp, #92]\n\t" #else - "strd r4, r5, [%[r]]\n\t" + "ldrd r6, r7, [sp, #88]\n\t" #endif + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [%[r], #8]\n\t" - "str r7, [%[r], #12]\n\t" + "str r4, [%[r], #24]\n\t" + "str r5, [%[r], #28]\n\t" #else - "strd r6, r7, [%[r], #8]\n\t" + "strd r4, r5, [%[r], #24]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [%[r], #16]\n\t" - "str r9, [%[r], #20]\n\t" + "str r6, [sp, #88]\n\t" + "str r7, [sp, #92]\n\t" #else - "strd r8, r9, [%[r], #16]\n\t" + "strd r6, r7, [sp, #88]\n\t" #endif + "ldr %[n], [sp, #172]\n\t" + /* Conditional Swap */ + "rsb %[n], %[n], #0\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [%[r], #24]\n\t" - "str r11, [%[r], #28]\n\t" + "ldr r4, [sp]\n\t" + "ldr r5, [sp, #4]\n\t" #else - "strd r10, r11, [%[r], #24]\n\t" + "ldrd r4, r5, [sp]\n\t" #endif - "add sp, sp, #0x40\n\t" - : [r] "+r" (r), [a] "+r" (a) - : - : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" - ); -} - -void fe_invert(fe r_p, const fe a_p) -{ - register fe r asm ("r0") = r_p; - register const fe a asm ("r1") = a_p; - - __asm__ __volatile__ ( - "sub sp, sp, #0x88\n\t" - /* Invert */ - "str %[r], [sp, #128]\n\t" - "str %[a], [sp, #132]\n\t" - "mov r0, sp\n\t" - "ldr r1, [sp, #132]\n\t" - "bl fe_sq\n\t" - "add r0, sp, #32\n\t" - "mov r1, sp\n\t" - "bl fe_sq\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "add r0, sp, #32\n\t" - "ldr r1, [sp, #132]\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "mov r0, sp\n\t" - "mov r1, sp\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "add r0, sp, #0x40\n\t" - "mov r1, sp\n\t" - "bl fe_sq\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "add r2, sp, #0x40\n\t" - "bl fe_mul\n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "mov r4, #4\n\t" - "\n" - "L_fe_invert1_%=: \n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #0x40\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_invert1_%=\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #0x40\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "add r0, sp, #0x40\n\t" 
- "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "mov r4, #9\n\t" - "\n" - "L_fe_invert2_%=: \n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #0x40\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_invert2_%=\n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #0x40\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "add r0, sp, #0x60\n\t" - "add r1, sp, #0x40\n\t" - "bl fe_sq\n\t" - "mov r4, #19\n\t" - "\n" - "L_fe_invert3_%=: \n\t" - "add r0, sp, #0x60\n\t" - "add r1, sp, #0x60\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_invert3_%=\n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #0x60\n\t" - "add r2, sp, #0x40\n\t" - "bl fe_mul\n\t" - "mov r4, #10\n\t" - "\n" - "L_fe_invert4_%=: \n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #0x40\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_invert4_%=\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #0x40\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "mov r4, #49\n\t" - "\n" - "L_fe_invert5_%=: \n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #0x40\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_invert5_%=\n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #0x40\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "add r0, sp, #0x60\n\t" - "add r1, sp, #0x40\n\t" - "bl fe_sq\n\t" - "mov r4, #0x63\n\t" - "\n" - "L_fe_invert6_%=: \n\t" - "add r0, sp, #0x60\n\t" - "add r1, sp, #0x60\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_invert6_%=\n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #0x60\n\t" - "add r2, sp, #0x40\n\t" - "bl fe_mul\n\t" - "mov r4, #50\n\t" - "\n" - "L_fe_invert7_%=: \n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #0x40\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_invert7_%=\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #0x40\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "mov r4, #5\n\t" - "\n" - "L_fe_invert8_%=: \n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_invert8_%=\n\t" - "ldr r0, [sp, #128]\n\t" - "add r1, sp, #32\n\t" - "mov r2, sp\n\t" - "bl fe_mul\n\t" - "ldr %[a], [sp, #132]\n\t" - "ldr %[r], [sp, #128]\n\t" - "add sp, sp, #0x88\n\t" - : [r] "+r" (r), [a] "+r" (a) - : - : "memory", "lr", "r4" - ); -} - -int curve25519(byte* r_p, const byte* n_p, const byte* a_p) -{ - register byte* r asm ("r0") = r_p; - register const byte* n asm ("r1") = n_p; - register const byte* a asm ("r2") = a_p; - - __asm__ __volatile__ ( - "sub sp, sp, #0xbc\n\t" - "str %[r], [sp, #160]\n\t" - "str %[n], [sp, #164]\n\t" - "str %[a], [sp, #168]\n\t" - "mov %[n], #0\n\t" - "str %[n], [sp, #172]\n\t" - /* Set one */ - "mov r10, #1\n\t" - "mov r11, #0\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [%[r]]\n\t" - "str r11, [%[r], #4]\n\t" + "ldr r6, [sp, #32]\n\t" + "ldr r7, [sp, #36]\n\t" #else - "strd r10, r11, [%[r]]\n\t" + "ldrd r6, r7, [sp, #32]\n\t" #endif - "mov r10, #0\n\t" + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [%[r], #8]\n\t" - "str r11, [%[r], #12]\n\t" + "str r4, [sp]\n\t" + "str r5, [sp, #4]\n\t" #else - "strd r10, r11, [%[r], #8]\n\t" + "strd r4, r5, [sp]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [%[r], #16]\n\t" - "str r11, [%[r], #20]\n\t" + "str r6, [sp, #32]\n\t" + "str r7, [sp, #36]\n\t" #else - "strd r10, r11, [%[r], #16]\n\t" + "strd r6, r7, [sp, #32]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [%[r], #24]\n\t" - "str r11, [%[r], #28]\n\t" + "ldr r4, [sp, #8]\n\t" + "ldr r5, [sp, #12]\n\t" #else - "strd r10, r11, [%[r], #24]\n\t" + "ldrd r4, r5, [sp, #8]\n\t" #endif - /* Set zero */ - "mov r10, #0\n\t" - "mov r11, #0\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp]\n\t" - "str r11, [sp, #4]\n\t" + "ldr r6, [sp, #40]\n\t" + "ldr r7, [sp, #44]\n\t" #else - "strd r10, r11, [sp]\n\t" + "ldrd r6, r7, [sp, #40]\n\t" #endif + "eor r8, r4, r6\n\t" + "eor r9, r5, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r9\n\t" + "eor r6, r6, r8\n\t" + "eor r7, r7, r9\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, #8]\n\t" - "str r11, [sp, #12]\n\t" + "str r4, [sp, #8]\n\t" + "str r5, [sp, #12]\n\t" #else - "strd r10, r11, [sp, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, #16]\n\t" - "str r11, [sp, #20]\n\t" -#else - "strd r10, r11, [sp, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, #24]\n\t" - "str r11, [sp, #28]\n\t" -#else - "strd r10, r11, [sp, #24]\n\t" -#endif - /* Set one */ - "mov r10, #1\n\t" - "mov r11, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, #32]\n\t" - "str r11, [sp, #36]\n\t" -#else - "strd r10, r11, [sp, #32]\n\t" -#endif - "mov r10, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, #40]\n\t" - "str r11, [sp, #44]\n\t" -#else - "strd r10, r11, [sp, #40]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, #48]\n\t" - "str r11, [sp, #52]\n\t" -#else - "strd r10, r11, [sp, #48]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, #56]\n\t" - "str r11, [sp, #60]\n\t" -#else - "strd r10, r11, [sp, #56]\n\t" -#endif - /* Copy */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[a]]\n\t" - "ldr r5, [%[a], #4]\n\t" -#else - "ldrd r4, r5, [%[a]]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[a], #12]\n\t" -#else - "ldrd r6, r7, [%[a], #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp, #64]\n\t" - "str r5, [sp, #68]\n\t" -#else - "strd r4, r5, [sp, #64]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [sp, #72]\n\t" - "str r7, [sp, #76]\n\t" -#else - "strd r6, r7, [sp, #72]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[a], #16]\n\t" - "ldr r5, [%[a], #20]\n\t" -#else - "ldrd r4, r5, [%[a], #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [%[a], #24]\n\t" - "ldr r7, [%[a], #28]\n\t" -#else - "ldrd r6, r7, [%[a], #24]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp, #80]\n\t" - "str r5, [sp, #84]\n\t" -#else - "strd r4, r5, [sp, #80]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [sp, #88]\n\t" - "str r7, [sp, #92]\n\t" -#else - "strd r6, r7, [sp, #88]\n\t" -#endif - "mov %[n], #30\n\t" 
- "str %[n], [sp, #180]\n\t" - "mov %[a], #28\n\t" - "str %[a], [sp, #176]\n\t" - "\n" - "L_curve25519_words_%=: \n\t" - "\n" - "L_curve25519_bits_%=: \n\t" - "ldr %[n], [sp, #164]\n\t" - "ldr %[a], [%[n], r2]\n\t" - "ldr %[n], [sp, #180]\n\t" - "lsr %[a], %[a], %[n]\n\t" - "and %[a], %[a], #1\n\t" - "str %[a], [sp, #184]\n\t" - "ldr %[n], [sp, #172]\n\t" - "eor %[n], %[n], %[a]\n\t" - "str %[n], [sp, #172]\n\t" - "ldr %[r], [sp, #160]\n\t" - /* Conditional Swap */ - "neg %[n], %[n]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[r]]\n\t" - "ldr r5, [%[r], #4]\n\t" -#else - "ldrd r4, r5, [%[r]]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #64]\n\t" - "ldr r7, [sp, #68]\n\t" -#else - "ldrd r6, r7, [sp, #64]\n\t" -#endif - "eor r8, r4, r6\n\t" - "eor r9, r5, r7\n\t" - "and r8, r8, %[n]\n\t" - "and r9, r9, %[n]\n\t" - "eor r4, r4, r8\n\t" - "eor r5, r5, r9\n\t" - "eor r6, r6, r8\n\t" - "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [%[r]]\n\t" - "str r5, [%[r], #4]\n\t" -#else - "strd r4, r5, [%[r]]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [sp, #64]\n\t" - "str r7, [sp, #68]\n\t" -#else - "strd r6, r7, [sp, #64]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[r], #8]\n\t" - "ldr r5, [%[r], #12]\n\t" -#else - "ldrd r4, r5, [%[r], #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #72]\n\t" - "ldr r7, [sp, #76]\n\t" -#else - "ldrd r6, r7, [sp, #72]\n\t" -#endif - "eor r8, r4, r6\n\t" - "eor r9, r5, r7\n\t" - "and r8, r8, %[n]\n\t" - "and r9, r9, %[n]\n\t" - "eor r4, r4, r8\n\t" - "eor r5, r5, r9\n\t" - "eor r6, r6, r8\n\t" - "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [%[r], #8]\n\t" - "str r5, [%[r], #12]\n\t" -#else - "strd r4, r5, [%[r], #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [sp, #72]\n\t" - "str r7, [sp, #76]\n\t" -#else - "strd r6, r7, [sp, #72]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[r], #16]\n\t" - "ldr r5, [%[r], #20]\n\t" -#else - "ldrd r4, r5, [%[r], #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #80]\n\t" - "ldr r7, [sp, #84]\n\t" -#else - "ldrd r6, r7, [sp, #80]\n\t" -#endif - "eor r8, r4, r6\n\t" - "eor r9, r5, r7\n\t" - "and r8, r8, %[n]\n\t" - "and r9, r9, %[n]\n\t" - "eor r4, r4, r8\n\t" - "eor r5, r5, r9\n\t" - "eor r6, r6, r8\n\t" - "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [%[r], #16]\n\t" - "str r5, [%[r], #20]\n\t" -#else - "strd r4, r5, [%[r], #16]\n\t" + "strd r4, r5, [sp, #8]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [sp, #80]\n\t" - "str r7, [sp, #84]\n\t" + "str r6, [sp, #40]\n\t" + "str r7, [sp, #44]\n\t" #else - "strd r6, r7, [sp, #80]\n\t" + "strd r6, r7, [sp, #40]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[r], #24]\n\t" - "ldr r5, [%[r], #28]\n\t" + "ldr r4, [sp, #16]\n\t" + "ldr r5, [sp, #20]\n\t" #else - "ldrd r4, r5, [%[r], #24]\n\t" + "ldrd r4, r5, [sp, #16]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #88]\n\t" - "ldr r7, [sp, #92]\n\t" + "ldr r6, [sp, #48]\n\t" + "ldr r7, [sp, #52]\n\t" #else - "ldrd r6, r7, [sp, 
#88]\n\t" + "ldrd r6, r7, [sp, #48]\n\t" #endif "eor r8, r4, r6\n\t" "eor r9, r5, r7\n\t" @@ -4539,31 +3142,28 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [%[r], #24]\n\t" - "str r5, [%[r], #28]\n\t" + "str r4, [sp, #16]\n\t" + "str r5, [sp, #20]\n\t" #else - "strd r4, r5, [%[r], #24]\n\t" + "strd r4, r5, [sp, #16]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [sp, #88]\n\t" - "str r7, [sp, #92]\n\t" + "str r6, [sp, #48]\n\t" + "str r7, [sp, #52]\n\t" #else - "strd r6, r7, [sp, #88]\n\t" + "strd r6, r7, [sp, #48]\n\t" #endif - "ldr %[n], [sp, #172]\n\t" - /* Conditional Swap */ - "neg %[n], %[n]\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp]\n\t" - "ldr r5, [sp, #4]\n\t" + "ldr r4, [sp, #24]\n\t" + "ldr r5, [sp, #28]\n\t" #else - "ldrd r4, r5, [sp]\n\t" + "ldrd r4, r5, [sp, #24]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #32]\n\t" - "ldr r7, [sp, #36]\n\t" + "ldr r6, [sp, #56]\n\t" + "ldr r7, [sp, #60]\n\t" #else - "ldrd r6, r7, [sp, #32]\n\t" + "ldrd r6, r7, [sp, #56]\n\t" #endif "eor r8, r4, r6\n\t" "eor r9, r5, r7\n\t" @@ -4574,5097 +3174,1297 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp]\n\t" - "str r5, [sp, #4]\n\t" -#else - "strd r4, r5, [sp]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [sp, #32]\n\t" - "str r7, [sp, #36]\n\t" -#else - "strd r6, r7, [sp, #32]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #8]\n\t" - "ldr r5, [sp, #12]\n\t" + "str r4, [sp, #24]\n\t" + "str r5, [sp, #28]\n\t" #else - "ldrd r4, r5, [sp, #8]\n\t" + "strd r4, r5, [sp, #24]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #40]\n\t" - "ldr r7, [sp, #44]\n\t" + "str r6, [sp, #56]\n\t" + "str r7, [sp, #60]\n\t" #else - "ldrd r6, r7, [sp, #40]\n\t" + "strd r6, r7, [sp, #56]\n\t" #endif - "eor r8, r4, r6\n\t" - "eor r9, r5, r7\n\t" - "and r8, r8, %[n]\n\t" - "and r9, r9, %[n]\n\t" - "eor r4, r4, r8\n\t" - "eor r5, r5, r9\n\t" - "eor r6, r6, r8\n\t" - "eor r7, r7, r9\n\t" + "ldr %[n], [sp, #184]\n\t" + "str %[n], [sp, #172]\n\t" + "mov r3, sp\n\t" + "ldr r2, [sp, #160]\n\t" + "add r1, sp, #0x80\n\t" + "ldr r0, [sp, #160]\n\t" + "bl fe_add_sub_op\n\t" + "add r3, sp, #32\n\t" + "add r2, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "mov r0, sp\n\t" + "bl fe_add_sub_op\n\t" + "ldr r2, [sp, #160]\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #32\n\t" + "bl fe_mul_op\n\t" + "add r2, sp, #0x80\n\t" + "mov r1, sp\n\t" + "mov r0, sp\n\t" + "bl fe_mul_op\n\t" + "add r1, sp, #0x80\n\t" + "add r0, sp, #0x80\n\t" + "bl fe_sq_op\n\t" + "ldr r1, [sp, #160]\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_sq_op\n\t" + "mov r3, sp\n\t" + "add r2, sp, #32\n\t" + "mov r1, sp\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_add_sub_op\n\t" + "add r2, sp, #0x80\n\t" + "add r1, sp, #0x60\n\t" + "ldr r0, [sp, #160]\n\t" + "bl fe_mul_op\n\t" + "add r2, sp, #0x80\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_sub_op\n\t" + "mov r1, sp\n\t" + "mov r0, sp\n\t" + "bl fe_sq_op\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #32\n\t" + "bl fe_mul121666\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_sq_op\n\t" + 
"add r2, sp, #32\n\t" + "add r1, sp, #0x80\n\t" + "add r0, sp, #0x80\n\t" + "bl fe_add_op\n\t" + "mov r2, sp\n\t" + "ldr r1, [sp, #168]\n\t" + "add r0, sp, #32\n\t" + "bl fe_mul_op\n\t" + "add r2, sp, #0x80\n\t" + "add r1, sp, #0x60\n\t" + "mov r0, sp\n\t" + "bl fe_mul_op\n\t" + "ldr %[a], [sp, #176]\n\t" + "ldr %[n], [sp, #180]\n\t" + "subs %[n], %[n], #1\n\t" + "str %[n], [sp, #180]\n\t" + "bge L_curve25519_bits_%=\n\t" + "mov %[n], #31\n\t" + "str %[n], [sp, #180]\n\t" + "subs %[a], %[a], #4\n\t" + "str %[a], [sp, #176]\n\t" + "bge L_curve25519_words_%=\n\t" + /* Invert */ + "add r1, sp, #0\n\t" + "add r0, sp, #32\n\t" + "bl fe_sq_op\n\t" + "add r1, sp, #32\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_sq_op\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_sq_op\n\t" + "add r2, sp, #0x40\n\t" + "add r1, sp, #0\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_mul_op\n\t" + "add r2, sp, #0x40\n\t" + "add r1, sp, #32\n\t" + "add r0, sp, #32\n\t" + "bl fe_mul_op\n\t" + "add r1, sp, #32\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_sq_op\n\t" + "add r2, sp, #0x60\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_mul_op\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_sq_op\n\t" + "mov r12, #4\n\t" + "\n" + "L_curve25519_inv_1_%=: \n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x60\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_curve25519_inv_1_%=\n\t" + "add r2, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_mul_op\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_sq_op\n\t" + "mov r12, #9\n\t" + "\n" + "L_curve25519_inv_2_%=: \n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x60\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_curve25519_inv_2_%=\n\t" + "add r2, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_mul_op\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x80\n\t" + "bl fe_sq_op\n\t" + "mov r12, #19\n\t" + "\n" + "L_curve25519_inv_3_%=: \n\t" + "add r1, sp, #0x80\n\t" + "add r0, sp, #0x80\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_curve25519_inv_3_%=\n\t" + "add r2, sp, #0x60\n\t" + "add r1, sp, #0x80\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_mul_op\n\t" + "mov r12, #10\n\t" + "\n" + "L_curve25519_inv_4_%=: \n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x60\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_curve25519_inv_4_%=\n\t" + "add r2, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_mul_op\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_sq_op\n\t" + "mov r12, #49\n\t" + "\n" + "L_curve25519_inv_5_%=: \n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x60\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_curve25519_inv_5_%=\n\t" + "add r2, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_mul_op\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x80\n\t" + "bl fe_sq_op\n\t" + "mov r12, #0x63\n\t" + "\n" + "L_curve25519_inv_6_%=: \n\t" + "add r1, sp, #0x80\n\t" + "add r0, sp, #0x80\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_curve25519_inv_6_%=\n\t" + "add r2, sp, #0x60\n\t" + "add r1, sp, #0x80\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_mul_op\n\t" + "mov r12, #50\n\t" + "\n" + "L_curve25519_inv_7_%=: \n\t" + "add r1, sp, 
#0x60\n\t" + "add r0, sp, #0x60\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_curve25519_inv_7_%=\n\t" + "add r2, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_mul_op\n\t" + "mov r12, #5\n\t" + "\n" + "L_curve25519_inv_8_%=: \n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x40\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_curve25519_inv_8_%=\n\t" + "add r2, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0\n\t" + "bl fe_mul_op\n\t" + "mov r2, sp\n\t" + "ldr r1, [sp, #160]\n\t" + "ldr r0, [sp, #160]\n\t" + "bl fe_mul_op\n\t" + "mov r0, #0\n\t" + "add sp, sp, #0xbc\n\t" + : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "lr" + ); + return (uint32_t)(size_t)r; +} + +#else +int curve25519(byte* r_p, const byte* n_p, const byte* a_p) +{ + register byte* r asm ("r0") = r_p; + register const byte* n asm ("r1") = n_p; + register const byte* a asm ("r2") = a_p; + + __asm__ __volatile__ ( + "sub sp, sp, #0xc0\n\t" + "str %[r], [sp, #176]\n\t" + "str %[n], [sp, #160]\n\t" + "str %[a], [sp, #172]\n\t" + "add r5, sp, #0x40\n\t" + "add r4, sp, #32\n\t" + "str sp, [sp, #184]\n\t" + "str r5, [sp, #180]\n\t" + "str r4, [sp, #188]\n\t" + "mov %[n], #0\n\t" + "str %[n], [sp, #164]\n\t" + /* Set one */ + "mov r10, #1\n\t" + "mov r11, #0\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp, #8]\n\t" - "str r5, [sp, #12]\n\t" + "str r10, [%[r]]\n\t" + "str r11, [%[r], #4]\n\t" #else - "strd r4, r5, [sp, #8]\n\t" + "strd r10, r11, [%[r]]\n\t" #endif + "mov r10, #0\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [sp, #40]\n\t" - "str r7, [sp, #44]\n\t" + "str r10, [%[r], #8]\n\t" + "str r11, [%[r], #12]\n\t" #else - "strd r6, r7, [sp, #40]\n\t" + "strd r10, r11, [%[r], #8]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #16]\n\t" - "ldr r5, [sp, #20]\n\t" + "str r10, [%[r], #16]\n\t" + "str r11, [%[r], #20]\n\t" #else - "ldrd r4, r5, [sp, #16]\n\t" + "strd r10, r11, [%[r], #16]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #48]\n\t" - "ldr r7, [sp, #52]\n\t" + "str r10, [%[r], #24]\n\t" + "str r11, [%[r], #28]\n\t" #else - "ldrd r6, r7, [sp, #48]\n\t" + "strd r10, r11, [%[r], #24]\n\t" #endif - "eor r8, r4, r6\n\t" - "eor r9, r5, r7\n\t" - "and r8, r8, %[n]\n\t" - "and r9, r9, %[n]\n\t" - "eor r4, r4, r8\n\t" - "eor r5, r5, r9\n\t" - "eor r6, r6, r8\n\t" - "eor r7, r7, r9\n\t" + /* Set zero */ + "mov r10, #0\n\t" + "mov r11, #0\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp, #16]\n\t" - "str r5, [sp, #20]\n\t" + "str r10, [sp]\n\t" + "str r11, [sp, #4]\n\t" #else - "strd r4, r5, [sp, #16]\n\t" + "strd r10, r11, [sp]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [sp, #48]\n\t" - "str r7, [sp, #52]\n\t" + "str r10, [sp, #8]\n\t" + "str r11, [sp, #12]\n\t" #else - "strd r6, r7, [sp, #48]\n\t" + "strd r10, r11, [sp, #8]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #24]\n\t" - "ldr r5, [sp, #28]\n\t" + "str r10, [sp, #16]\n\t" + "str r11, [sp, #20]\n\t" #else - "ldrd r4, r5, [sp, #24]\n\t" + "strd r10, r11, [sp, #16]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #56]\n\t" - "ldr r7, [sp, #60]\n\t" + "str 
r10, [sp, #24]\n\t" + "str r11, [sp, #28]\n\t" #else - "ldrd r6, r7, [sp, #56]\n\t" + "strd r10, r11, [sp, #24]\n\t" #endif - "eor r8, r4, r6\n\t" - "eor r9, r5, r7\n\t" - "and r8, r8, %[n]\n\t" - "and r9, r9, %[n]\n\t" - "eor r4, r4, r8\n\t" - "eor r5, r5, r9\n\t" - "eor r6, r6, r8\n\t" - "eor r7, r7, r9\n\t" + /* Set one */ + "mov r10, #1\n\t" + "mov r11, #0\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp, #24]\n\t" - "str r5, [sp, #28]\n\t" + "str r10, [sp, #32]\n\t" + "str r11, [sp, #36]\n\t" #else - "strd r4, r5, [sp, #24]\n\t" + "strd r10, r11, [sp, #32]\n\t" #endif + "mov r10, #0\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [sp, #56]\n\t" - "str r7, [sp, #60]\n\t" + "str r10, [sp, #40]\n\t" + "str r11, [sp, #44]\n\t" #else - "strd r6, r7, [sp, #56]\n\t" + "strd r10, r11, [sp, #40]\n\t" #endif - "ldr %[n], [sp, #184]\n\t" - "str %[n], [sp, #172]\n\t" - /* Add-Sub */ - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[r]]\n\t" - "ldr r5, [%[r], #4]\n\t" -#else - "ldrd r4, r5, [%[r]]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp]\n\t" - "ldr r7, [sp, #4]\n\t" -#else - "ldrd r6, r7, [sp]\n\t" -#endif - "adds r8, r4, r6\n\t" - "mov r3, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [%[r]]\n\t" - "str r9, [%[r], #4]\n\t" -#else - "strd r8, r9, [%[r]]\n\t" -#endif - /* Sub */ - "subs r10, r4, r6\n\t" - "mov r12, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, #128]\n\t" - "str r11, [sp, #132]\n\t" -#else - "strd r10, r11, [sp, #128]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[r], #8]\n\t" - "ldr r5, [%[r], #12]\n\t" -#else - "ldrd r4, r5, [%[r], #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #8]\n\t" - "ldr r7, [sp, #12]\n\t" -#else - "ldrd r6, r7, [sp, #8]\n\t" -#endif - "adds r3, r3, #-1\n\t" - "adcs r8, r4, r6\n\t" - "mov r3, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [%[r], #8]\n\t" - "str r9, [%[r], #12]\n\t" -#else - "strd r8, r9, [%[r], #8]\n\t" -#endif - /* Sub */ - "adds r12, r12, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "mov r12, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, #136]\n\t" - "str r11, [sp, #140]\n\t" -#else - "strd r10, r11, [sp, #136]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[r], #16]\n\t" - "ldr r5, [%[r], #20]\n\t" -#else - "ldrd r4, r5, [%[r], #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #16]\n\t" - "ldr r7, [sp, #20]\n\t" -#else - "ldrd r6, r7, [sp, #16]\n\t" -#endif - "adds r3, r3, #-1\n\t" - "adcs r8, r4, r6\n\t" - "mov r3, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [%[r], #16]\n\t" - "str r9, [%[r], #20]\n\t" -#else - "strd r8, r9, [%[r], #16]\n\t" -#endif - /* Sub */ - "adds r12, r12, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "mov r12, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, 
#144]\n\t" - "str r11, [sp, #148]\n\t" -#else - "strd r10, r11, [sp, #144]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[r], #24]\n\t" - "ldr r5, [%[r], #28]\n\t" -#else - "ldrd r4, r5, [%[r], #24]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #24]\n\t" - "ldr r7, [sp, #28]\n\t" -#else - "ldrd r6, r7, [sp, #24]\n\t" -#endif - "adds r3, r3, #-1\n\t" - "adcs r8, r4, r6\n\t" - "adc r9, r5, r7\n\t" - /* Sub */ - "adds r12, r12, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "sbc r11, r5, r7\n\t" - "mov r3, #-19\n\t" - "asr %[a], r9, #31\n\t" - /* Mask the modulus */ - "and r3, %[a], r3\n\t" - "and r12, %[a], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[r]]\n\t" - "ldr r5, [%[r], #4]\n\t" -#else - "ldrd r4, r5, [%[r]]\n\t" -#endif - "subs r4, r4, r3\n\t" - "sbcs r5, r5, %[a]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [%[r]]\n\t" - "str r5, [%[r], #4]\n\t" -#else - "strd r4, r5, [%[r]]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[r], #8]\n\t" - "ldr r5, [%[r], #12]\n\t" -#else - "ldrd r4, r5, [%[r], #8]\n\t" -#endif - "sbcs r4, r4, %[a]\n\t" - "sbcs r5, r5, %[a]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [%[r], #8]\n\t" - "str r5, [%[r], #12]\n\t" -#else - "strd r4, r5, [%[r], #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [%[r], #16]\n\t" - "ldr r5, [%[r], #20]\n\t" -#else - "ldrd r4, r5, [%[r], #16]\n\t" -#endif - "sbcs r4, r4, %[a]\n\t" - "sbcs r5, r5, %[a]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [%[r], #16]\n\t" - "str r5, [%[r], #20]\n\t" -#else - "strd r4, r5, [%[r], #16]\n\t" -#endif - "sbcs r8, r8, %[a]\n\t" - "sbc r9, r9, r12\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [%[r], #24]\n\t" - "str r9, [%[r], #28]\n\t" -#else - "strd r8, r9, [%[r], #24]\n\t" -#endif - "mov r3, #-19\n\t" - "asr %[a], r11, #31\n\t" - /* Mask the modulus */ - "and r3, %[a], r3\n\t" - "and r12, %[a], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #128]\n\t" - "ldr r5, [sp, #132]\n\t" -#else - "ldrd r4, r5, [sp, #128]\n\t" -#endif - "adds r4, r4, r3\n\t" - "adcs r5, r5, %[a]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp, #128]\n\t" - "str r5, [sp, #132]\n\t" -#else - "strd r4, r5, [sp, #128]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #136]\n\t" - "ldr r5, [sp, #140]\n\t" -#else - "ldrd r4, r5, [sp, #136]\n\t" -#endif - "adcs r4, r4, %[a]\n\t" - "adcs r5, r5, %[a]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp, #136]\n\t" - "str r5, [sp, #140]\n\t" -#else - "strd r4, r5, [sp, #136]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #144]\n\t" - "ldr r5, [sp, #148]\n\t" -#else - "ldrd r4, r5, [sp, #144]\n\t" -#endif - "adcs r4, r4, %[a]\n\t" - "adcs r5, r5, %[a]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp, #144]\n\t" - "str r5, [sp, #148]\n\t" -#else - "strd r4, r5, [sp, #144]\n\t" -#endif - "adcs r10, r10, %[a]\n\t" - "adc r11, r11, r12\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, #152]\n\t" - 
"str r11, [sp, #156]\n\t" -#else - "strd r10, r11, [sp, #152]\n\t" -#endif - /* Add-Sub */ - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #64]\n\t" - "ldr r5, [sp, #68]\n\t" -#else - "ldrd r4, r5, [sp, #64]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #32]\n\t" - "ldr r7, [sp, #36]\n\t" -#else - "ldrd r6, r7, [sp, #32]\n\t" -#endif - "adds r8, r4, r6\n\t" - "mov r3, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [sp]\n\t" - "str r9, [sp, #4]\n\t" -#else - "strd r8, r9, [sp]\n\t" -#endif - /* Sub */ - "subs r10, r4, r6\n\t" - "mov r12, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, #96]\n\t" - "str r11, [sp, #100]\n\t" -#else - "strd r10, r11, [sp, #96]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #72]\n\t" - "ldr r5, [sp, #76]\n\t" -#else - "ldrd r4, r5, [sp, #72]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #40]\n\t" - "ldr r7, [sp, #44]\n\t" -#else - "ldrd r6, r7, [sp, #40]\n\t" -#endif - "adds r3, r3, #-1\n\t" - "adcs r8, r4, r6\n\t" - "mov r3, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [sp, #8]\n\t" - "str r9, [sp, #12]\n\t" -#else - "strd r8, r9, [sp, #8]\n\t" -#endif - /* Sub */ - "adds r12, r12, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "mov r12, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, #104]\n\t" - "str r11, [sp, #108]\n\t" -#else - "strd r10, r11, [sp, #104]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #80]\n\t" - "ldr r5, [sp, #84]\n\t" -#else - "ldrd r4, r5, [sp, #80]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #48]\n\t" - "ldr r7, [sp, #52]\n\t" -#else - "ldrd r6, r7, [sp, #48]\n\t" -#endif - "adds r3, r3, #-1\n\t" - "adcs r8, r4, r6\n\t" - "mov r3, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [sp, #16]\n\t" - "str r9, [sp, #20]\n\t" -#else - "strd r8, r9, [sp, #16]\n\t" -#endif - /* Sub */ - "adds r12, r12, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "mov r12, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, #112]\n\t" - "str r11, [sp, #116]\n\t" -#else - "strd r10, r11, [sp, #112]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #88]\n\t" - "ldr r5, [sp, #92]\n\t" -#else - "ldrd r4, r5, [sp, #88]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #56]\n\t" - "ldr r7, [sp, #60]\n\t" -#else - "ldrd r6, r7, [sp, #56]\n\t" -#endif - "adds r3, r3, #-1\n\t" - "adcs r8, r4, r6\n\t" - "adc r9, r5, r7\n\t" - /* Sub */ - "adds r12, r12, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "sbc r11, r5, r7\n\t" - "mov r3, #-19\n\t" - "asr %[a], r9, #31\n\t" - /* Mask the modulus */ - "and r3, %[a], r3\n\t" - "and r12, %[a], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp]\n\t" - "ldr r5, [sp, #4]\n\t" -#else - "ldrd r4, r5, 
[sp]\n\t" -#endif - "subs r4, r4, r3\n\t" - "sbcs r5, r5, %[a]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp]\n\t" - "str r5, [sp, #4]\n\t" -#else - "strd r4, r5, [sp]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #8]\n\t" - "ldr r5, [sp, #12]\n\t" -#else - "ldrd r4, r5, [sp, #8]\n\t" -#endif - "sbcs r4, r4, %[a]\n\t" - "sbcs r5, r5, %[a]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp, #8]\n\t" - "str r5, [sp, #12]\n\t" -#else - "strd r4, r5, [sp, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #16]\n\t" - "ldr r5, [sp, #20]\n\t" -#else - "ldrd r4, r5, [sp, #16]\n\t" -#endif - "sbcs r4, r4, %[a]\n\t" - "sbcs r5, r5, %[a]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp, #16]\n\t" - "str r5, [sp, #20]\n\t" -#else - "strd r4, r5, [sp, #16]\n\t" -#endif - "sbcs r8, r8, %[a]\n\t" - "sbc r9, r9, r12\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [sp, #24]\n\t" - "str r9, [sp, #28]\n\t" -#else - "strd r8, r9, [sp, #24]\n\t" -#endif - "mov r3, #-19\n\t" - "asr %[a], r11, #31\n\t" - /* Mask the modulus */ - "and r3, %[a], r3\n\t" - "and r12, %[a], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #96]\n\t" - "ldr r5, [sp, #100]\n\t" -#else - "ldrd r4, r5, [sp, #96]\n\t" -#endif - "adds r4, r4, r3\n\t" - "adcs r5, r5, %[a]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp, #96]\n\t" - "str r5, [sp, #100]\n\t" -#else - "strd r4, r5, [sp, #96]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #104]\n\t" - "ldr r5, [sp, #108]\n\t" -#else - "ldrd r4, r5, [sp, #104]\n\t" -#endif - "adcs r4, r4, %[a]\n\t" - "adcs r5, r5, %[a]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp, #104]\n\t" - "str r5, [sp, #108]\n\t" -#else - "strd r4, r5, [sp, #104]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #112]\n\t" - "ldr r5, [sp, #116]\n\t" -#else - "ldrd r4, r5, [sp, #112]\n\t" -#endif - "adcs r4, r4, %[a]\n\t" - "adcs r5, r5, %[a]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp, #112]\n\t" - "str r5, [sp, #116]\n\t" -#else - "strd r4, r5, [sp, #112]\n\t" -#endif - "adcs r10, r10, %[a]\n\t" - "adc r11, r11, r12\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, #120]\n\t" - "str r11, [sp, #124]\n\t" -#else - "strd r10, r11, [sp, #120]\n\t" -#endif - "ldr r2, [sp, #160]\n\t" - "add r1, sp, #0x60\n\t" - "add r0, sp, #32\n\t" - "bl fe_mul\n\t" - "add r2, sp, #0x80\n\t" - "add r1, sp, #0\n\t" - "add r0, sp, #0\n\t" - "bl fe_mul\n\t" - "add r1, sp, #0x80\n\t" - "add r0, sp, #0x60\n\t" - "bl fe_sq\n\t" - "ldr r1, [sp, #160]\n\t" - "add r0, sp, #0x80\n\t" - "bl fe_sq\n\t" - /* Add-Sub */ - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #32]\n\t" - "ldr r5, [sp, #36]\n\t" -#else - "ldrd r4, r5, [sp, #32]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp]\n\t" - "ldr r7, [sp, #4]\n\t" -#else - "ldrd r6, r7, [sp]\n\t" -#endif - "adds r8, r4, r6\n\t" - "mov r3, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [sp, #64]\n\t" - "str r9, [sp, 
#68]\n\t" -#else - "strd r8, r9, [sp, #64]\n\t" -#endif - /* Sub */ - "subs r10, r4, r6\n\t" - "mov r12, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp]\n\t" - "str r11, [sp, #4]\n\t" -#else - "strd r10, r11, [sp]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #40]\n\t" - "ldr r5, [sp, #44]\n\t" -#else - "ldrd r4, r5, [sp, #40]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #8]\n\t" - "ldr r7, [sp, #12]\n\t" -#else - "ldrd r6, r7, [sp, #8]\n\t" -#endif - "adds r3, r3, #-1\n\t" - "adcs r8, r4, r6\n\t" - "mov r3, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [sp, #72]\n\t" - "str r9, [sp, #76]\n\t" -#else - "strd r8, r9, [sp, #72]\n\t" -#endif - /* Sub */ - "adds r12, r12, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "mov r12, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, #8]\n\t" - "str r11, [sp, #12]\n\t" -#else - "strd r10, r11, [sp, #8]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #48]\n\t" - "ldr r5, [sp, #52]\n\t" -#else - "ldrd r4, r5, [sp, #48]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #16]\n\t" - "ldr r7, [sp, #20]\n\t" -#else - "ldrd r6, r7, [sp, #16]\n\t" -#endif - "adds r3, r3, #-1\n\t" - "adcs r8, r4, r6\n\t" - "mov r3, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [sp, #80]\n\t" - "str r9, [sp, #84]\n\t" -#else - "strd r8, r9, [sp, #80]\n\t" -#endif - /* Sub */ - "adds r12, r12, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "mov r12, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, #16]\n\t" - "str r11, [sp, #20]\n\t" -#else - "strd r10, r11, [sp, #16]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #56]\n\t" - "ldr r5, [sp, #60]\n\t" -#else - "ldrd r4, r5, [sp, #56]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #24]\n\t" - "ldr r7, [sp, #28]\n\t" -#else - "ldrd r6, r7, [sp, #24]\n\t" -#endif - "adds r3, r3, #-1\n\t" - "adcs r8, r4, r6\n\t" - "adc r9, r5, r7\n\t" - /* Sub */ - "adds r12, r12, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "sbc r11, r5, r7\n\t" - "mov r3, #-19\n\t" - "asr %[a], r9, #31\n\t" - /* Mask the modulus */ - "and r3, %[a], r3\n\t" - "and r12, %[a], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #64]\n\t" - "ldr r5, [sp, #68]\n\t" -#else - "ldrd r4, r5, [sp, #64]\n\t" -#endif - "subs r4, r4, r3\n\t" - "sbcs r5, r5, %[a]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp, #64]\n\t" - "str r5, [sp, #68]\n\t" -#else - "strd r4, r5, [sp, #64]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #72]\n\t" - "ldr r5, [sp, #76]\n\t" -#else - "ldrd r4, r5, [sp, #72]\n\t" -#endif - "sbcs r4, r4, %[a]\n\t" - "sbcs r5, r5, %[a]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp, #72]\n\t" - "str r5, [sp, #76]\n\t" -#else - "strd r4, r5, [sp, #72]\n\t" -#endif -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #80]\n\t" - "ldr r5, [sp, #84]\n\t" -#else - "ldrd r4, r5, [sp, #80]\n\t" -#endif - "sbcs r4, r4, %[a]\n\t" - "sbcs r5, r5, %[a]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp, #80]\n\t" - "str r5, [sp, #84]\n\t" -#else - "strd r4, r5, [sp, #80]\n\t" -#endif - "sbcs r8, r8, %[a]\n\t" - "sbc r9, r9, r12\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [sp, #88]\n\t" - "str r9, [sp, #92]\n\t" -#else - "strd r8, r9, [sp, #88]\n\t" -#endif - "mov r3, #-19\n\t" - "asr %[a], r11, #31\n\t" - /* Mask the modulus */ - "and r3, %[a], r3\n\t" - "and r12, %[a], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp]\n\t" - "ldr r5, [sp, #4]\n\t" -#else - "ldrd r4, r5, [sp]\n\t" -#endif - "adds r4, r4, r3\n\t" - "adcs r5, r5, %[a]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp]\n\t" - "str r5, [sp, #4]\n\t" -#else - "strd r4, r5, [sp]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #8]\n\t" - "ldr r5, [sp, #12]\n\t" -#else - "ldrd r4, r5, [sp, #8]\n\t" -#endif - "adcs r4, r4, %[a]\n\t" - "adcs r5, r5, %[a]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp, #8]\n\t" - "str r5, [sp, #12]\n\t" -#else - "strd r4, r5, [sp, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #16]\n\t" - "ldr r5, [sp, #20]\n\t" -#else - "ldrd r4, r5, [sp, #16]\n\t" -#endif - "adcs r4, r4, %[a]\n\t" - "adcs r5, r5, %[a]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp, #16]\n\t" - "str r5, [sp, #20]\n\t" -#else - "strd r4, r5, [sp, #16]\n\t" -#endif - "adcs r10, r10, %[a]\n\t" - "adc r11, r11, r12\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, #24]\n\t" - "str r11, [sp, #28]\n\t" -#else - "strd r10, r11, [sp, #24]\n\t" -#endif - "add r2, sp, #0x60\n\t" - "add r1, sp, #0x80\n\t" - "ldr r0, [sp, #160]\n\t" - "bl fe_mul\n\t" - /* Sub */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #128]\n\t" - "ldr r5, [sp, #132]\n\t" -#else - "ldrd r4, r5, [sp, #128]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #136]\n\t" - "ldr r7, [sp, #140]\n\t" -#else - "ldrd r6, r7, [sp, #136]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [sp, #96]\n\t" - "ldr r9, [sp, #100]\n\t" -#else - "ldrd r8, r9, [sp, #96]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [sp, #104]\n\t" - "ldr r11, [sp, #108]\n\t" -#else - "ldrd r10, r11, [sp, #104]\n\t" -#endif - "subs r8, r4, r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbcs r11, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [sp, #128]\n\t" - "str r9, [sp, #132]\n\t" -#else - "strd r8, r9, [sp, #128]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, #136]\n\t" - "str r11, [sp, #140]\n\t" -#else - "strd r10, r11, [sp, #136]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #144]\n\t" - "ldr r5, [sp, #148]\n\t" -#else - "ldrd r4, r5, [sp, #144]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #152]\n\t" - "ldr r7, [sp, #156]\n\t" -#else - 
"ldrd r6, r7, [sp, #152]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [sp, #112]\n\t" - "ldr r9, [sp, #116]\n\t" -#else - "ldrd r8, r9, [sp, #112]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [sp, #120]\n\t" - "ldr r11, [sp, #124]\n\t" -#else - "ldrd r10, r11, [sp, #120]\n\t" -#endif - "sbcs r8, r4, r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbc r11, r7, r11\n\t" - "mov r3, #-19\n\t" - "asr %[a], r11, #31\n\t" - /* Mask the modulus */ - "and r3, %[a], r3\n\t" - "and r12, %[a], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #128]\n\t" - "ldr r5, [sp, #132]\n\t" -#else - "ldrd r4, r5, [sp, #128]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #136]\n\t" - "ldr r7, [sp, #140]\n\t" -#else - "ldrd r6, r7, [sp, #136]\n\t" -#endif - "adds r4, r4, r3\n\t" - "adcs r5, r5, %[a]\n\t" - "adcs r6, r6, %[a]\n\t" - "adcs r7, r7, %[a]\n\t" - "adcs r8, r8, %[a]\n\t" - "adcs r9, r9, %[a]\n\t" - "adcs r10, r10, %[a]\n\t" - "adc r11, r11, r12\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp, #128]\n\t" - "str r5, [sp, #132]\n\t" -#else - "strd r4, r5, [sp, #128]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [sp, #136]\n\t" - "str r7, [sp, #140]\n\t" -#else - "strd r6, r7, [sp, #136]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [sp, #144]\n\t" - "str r9, [sp, #148]\n\t" -#else - "strd r8, r9, [sp, #144]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, #152]\n\t" - "str r11, [sp, #156]\n\t" -#else - "strd r10, r11, [sp, #152]\n\t" -#endif - "add r1, sp, #0\n\t" - "add r0, sp, #0\n\t" - "bl fe_sq\n\t" - /* Multiply by 121666 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #128]\n\t" - "ldr r5, [sp, #132]\n\t" -#else - "ldrd r4, r5, [sp, #128]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #136]\n\t" - "ldr r7, [sp, #140]\n\t" -#else - "ldrd r6, r7, [sp, #136]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [sp, #144]\n\t" - "ldr r9, [sp, #148]\n\t" -#else - "ldrd r8, r9, [sp, #144]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [sp, #152]\n\t" - "ldr r11, [sp, #156]\n\t" -#else - "ldrd r10, r11, [sp, #152]\n\t" -#endif - "movw r12, #0xdb42\n\t" - "movt r12, #1\n\t" - "umull r4, %[a], r4, r12\n\t" - "umull r5, r3, r5, r12\n\t" - "adds r5, r5, %[a]\n\t" - "adc %[a], r3, #0\n\t" - "umull r6, r3, r6, r12\n\t" - "adds r6, r6, %[a]\n\t" - "adc %[a], r3, #0\n\t" - "umull r7, r3, r7, r12\n\t" - "adds r7, r7, %[a]\n\t" - "adc %[a], r3, #0\n\t" - "umull r8, r3, r8, r12\n\t" - "adds r8, r8, %[a]\n\t" - "adc %[a], r3, #0\n\t" - "umull r9, r3, r9, r12\n\t" - "adds r9, r9, %[a]\n\t" - "adc %[a], r3, #0\n\t" - "umull r10, r3, r10, r12\n\t" - "adds r10, r10, %[a]\n\t" - "adc %[a], r3, #0\n\t" - "umull r11, r3, r11, r12\n\t" - "adds r11, r11, %[a]\n\t" - "adc %[a], r3, #0\n\t" - "mov r12, #19\n\t" - "lsl %[a], %[a], #1\n\t" - "orr %[a], %[a], r11, lsr #31\n\t" - "mul %[a], %[a], r12\n\t" - "and r11, r11, #0x7fffffff\n\t" - "adds r4, r4, %[a]\n\t" - "adcs r5, r5, #0\n\t" - "adcs r6, r6, #0\n\t" - "adcs r7, r7, #0\n\t" - "adcs r8, r8, #0\n\t" - "adcs r9, r9, #0\n\t" - "adcs r10, r10, 
#0\n\t" - "adc r11, r11, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp, #32]\n\t" - "str r5, [sp, #36]\n\t" -#else - "strd r4, r5, [sp, #32]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [sp, #40]\n\t" - "str r7, [sp, #44]\n\t" -#else - "strd r6, r7, [sp, #40]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [sp, #48]\n\t" - "str r9, [sp, #52]\n\t" -#else - "strd r8, r9, [sp, #48]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, #56]\n\t" - "str r11, [sp, #60]\n\t" -#else - "strd r10, r11, [sp, #56]\n\t" -#endif - "add r1, sp, #0x40\n\t" - "add r0, sp, #0x40\n\t" - "bl fe_sq\n\t" - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #96]\n\t" - "ldr r5, [sp, #100]\n\t" -#else - "ldrd r4, r5, [sp, #96]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #104]\n\t" - "ldr r7, [sp, #108]\n\t" -#else - "ldrd r6, r7, [sp, #104]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [sp, #32]\n\t" - "ldr r9, [sp, #36]\n\t" -#else - "ldrd r8, r9, [sp, #32]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [sp, #40]\n\t" - "ldr r11, [sp, #44]\n\t" -#else - "ldrd r10, r11, [sp, #40]\n\t" -#endif - "adds r8, r4, r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adcs r11, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [sp, #96]\n\t" - "str r9, [sp, #100]\n\t" -#else - "strd r8, r9, [sp, #96]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, #104]\n\t" - "str r11, [sp, #108]\n\t" -#else - "strd r10, r11, [sp, #104]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #112]\n\t" - "ldr r5, [sp, #116]\n\t" -#else - "ldrd r4, r5, [sp, #112]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #120]\n\t" - "ldr r7, [sp, #124]\n\t" -#else - "ldrd r6, r7, [sp, #120]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [sp, #48]\n\t" - "ldr r9, [sp, #52]\n\t" -#else - "ldrd r8, r9, [sp, #48]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [sp, #56]\n\t" - "ldr r11, [sp, #60]\n\t" -#else - "ldrd r10, r11, [sp, #56]\n\t" -#endif - "adcs r8, r4, r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adc r11, r7, r11\n\t" - "mov r3, #-19\n\t" - "asr %[a], r11, #31\n\t" - /* Mask the modulus */ - "and r3, %[a], r3\n\t" - "and r12, %[a], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [sp, #96]\n\t" - "ldr r5, [sp, #100]\n\t" -#else - "ldrd r4, r5, [sp, #96]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [sp, #104]\n\t" - "ldr r7, [sp, #108]\n\t" -#else - "ldrd r6, r7, [sp, #104]\n\t" -#endif - "subs r4, r4, r3\n\t" - "sbcs r5, r5, %[a]\n\t" - "sbcs r6, r6, %[a]\n\t" - "sbcs r7, r7, %[a]\n\t" - "sbcs r8, r8, %[a]\n\t" - "sbcs r9, r9, %[a]\n\t" - "sbcs r10, r10, %[a]\n\t" - "sbc r11, r11, r12\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [sp, #96]\n\t" - "str r5, [sp, #100]\n\t" -#else - "strd r4, r5, [sp, #96]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [sp, #104]\n\t" - 
"str r7, [sp, #108]\n\t" -#else - "strd r6, r7, [sp, #104]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [sp, #112]\n\t" - "str r9, [sp, #116]\n\t" -#else - "strd r8, r9, [sp, #112]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [sp, #120]\n\t" - "str r11, [sp, #124]\n\t" -#else - "strd r10, r11, [sp, #120]\n\t" -#endif - "add r2, sp, #0\n\t" - "ldr r1, [sp, #168]\n\t" - "add r0, sp, #32\n\t" - "bl fe_mul\n\t" - "add r2, sp, #0x60\n\t" - "add r1, sp, #0x80\n\t" - "add r0, sp, #0\n\t" - "bl fe_mul\n\t" - "ldr %[a], [sp, #176]\n\t" - "ldr %[n], [sp, #180]\n\t" - "subs %[n], %[n], #1\n\t" - "str %[n], [sp, #180]\n\t" - "bge L_curve25519_bits_%=\n\t" - "mov %[n], #31\n\t" - "str %[n], [sp, #180]\n\t" - "subs %[a], %[a], #4\n\t" - "str %[a], [sp, #176]\n\t" - "bge L_curve25519_words_%=\n\t" - /* Invert */ - "add r0, sp, #32\n\t" - "add r1, sp, #0\n\t" - "bl fe_sq\n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #0x40\n\t" - "bl fe_sq\n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #0\n\t" - "add r2, sp, #0x40\n\t" - "bl fe_mul\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "add r2, sp, #0x40\n\t" - "bl fe_mul\n\t" - "add r0, sp, #0x60\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #0x40\n\t" - "add r2, sp, #0x60\n\t" - "bl fe_mul\n\t" - "add r0, sp, #0x60\n\t" - "add r1, sp, #0x40\n\t" - "bl fe_sq\n\t" - "mov r4, #4\n\t" - "\n" - "L_curve25519_inv_1_%=: \n\t" - "add r0, sp, #0x60\n\t" - "add r1, sp, #0x60\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_curve25519_inv_1_%=\n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #0x60\n\t" - "add r2, sp, #0x40\n\t" - "bl fe_mul\n\t" - "add r0, sp, #0x60\n\t" - "add r1, sp, #0x40\n\t" - "bl fe_sq\n\t" - "mov r4, #9\n\t" - "\n" - "L_curve25519_inv_2_%=: \n\t" - "add r0, sp, #0x60\n\t" - "add r1, sp, #0x60\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_curve25519_inv_2_%=\n\t" - "add r0, sp, #0x60\n\t" - "add r1, sp, #0x60\n\t" - "add r2, sp, #0x40\n\t" - "bl fe_mul\n\t" - "add r0, sp, #0x80\n\t" - "add r1, sp, #0x60\n\t" - "bl fe_sq\n\t" - "mov r4, #19\n\t" - "\n" - "L_curve25519_inv_3_%=: \n\t" - "add r0, sp, #0x80\n\t" - "add r1, sp, #0x80\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_curve25519_inv_3_%=\n\t" - "add r0, sp, #0x60\n\t" - "add r1, sp, #0x80\n\t" - "add r2, sp, #0x60\n\t" - "bl fe_mul\n\t" - "mov r4, #10\n\t" - "\n" - "L_curve25519_inv_4_%=: \n\t" - "add r0, sp, #0x60\n\t" - "add r1, sp, #0x60\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_curve25519_inv_4_%=\n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #0x60\n\t" - "add r2, sp, #0x40\n\t" - "bl fe_mul\n\t" - "add r0, sp, #0x60\n\t" - "add r1, sp, #0x40\n\t" - "bl fe_sq\n\t" - "mov r4, #49\n\t" - "\n" - "L_curve25519_inv_5_%=: \n\t" - "add r0, sp, #0x60\n\t" - "add r1, sp, #0x60\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_curve25519_inv_5_%=\n\t" - "add r0, sp, #0x60\n\t" - "add r1, sp, #0x60\n\t" - "add r2, sp, #0x40\n\t" - "bl fe_mul\n\t" - "add r0, sp, #0x80\n\t" - "add r1, sp, #0x60\n\t" - "bl fe_sq\n\t" - "mov r4, #0x63\n\t" - "\n" - "L_curve25519_inv_6_%=: \n\t" - "add r0, sp, #0x80\n\t" - "add r1, sp, #0x80\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_curve25519_inv_6_%=\n\t" - "add r0, sp, #0x60\n\t" - "add r1, sp, #0x80\n\t" - "add r2, 
sp, #0x60\n\t" - "bl fe_mul\n\t" - "mov r4, #50\n\t" - "\n" - "L_curve25519_inv_7_%=: \n\t" - "add r0, sp, #0x60\n\t" - "add r1, sp, #0x60\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_curve25519_inv_7_%=\n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #0x60\n\t" - "add r2, sp, #0x40\n\t" - "bl fe_mul\n\t" - "mov r4, #5\n\t" - "\n" - "L_curve25519_inv_8_%=: \n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #0x40\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_curve25519_inv_8_%=\n\t" - "add r0, sp, #0\n\t" - "add r1, sp, #0x40\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "add r2, sp, #0\n\t" - "ldr r1, [sp, #160]\n\t" - "ldr r0, [sp, #160]\n\t" - "bl fe_mul\n\t" - "mov r0, #0\n\t" - "add sp, sp, #0xbc\n\t" - : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a) - : - : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" - ); - return (uint32_t)(size_t)r; -} - -void fe_pow22523(fe r_p, const fe a_p) -{ - register fe r asm ("r0") = r_p; - register const fe a asm ("r1") = a_p; - - __asm__ __volatile__ ( - "sub sp, sp, #0x68\n\t" - /* pow22523 */ - "str %[r], [sp, #96]\n\t" - "str %[a], [sp, #100]\n\t" - "mov r0, sp\n\t" - "ldr r1, [sp, #100]\n\t" - "bl fe_sq\n\t" - "add r0, sp, #32\n\t" - "mov r1, sp\n\t" - "bl fe_sq\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "add r0, sp, #32\n\t" - "ldr r1, [sp, #100]\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "mov r0, sp\n\t" - "mov r1, sp\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "mov r0, sp\n\t" - "mov r1, sp\n\t" - "bl fe_sq\n\t" - "mov r0, sp\n\t" - "add r1, sp, #32\n\t" - "mov r2, sp\n\t" - "bl fe_mul\n\t" - "add r0, sp, #32\n\t" - "mov r1, sp\n\t" - "bl fe_sq\n\t" - "mov r4, #4\n\t" - "\n" - "L_fe_pow22523_1_%=: \n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_pow22523_1_%=\n\t" - "mov r0, sp\n\t" - "add r1, sp, #32\n\t" - "mov r2, sp\n\t" - "bl fe_mul\n\t" - "add r0, sp, #32\n\t" - "mov r1, sp\n\t" - "bl fe_sq\n\t" - "mov r4, #9\n\t" - "\n" - "L_fe_pow22523_2_%=: \n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_pow22523_2_%=\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "mov r2, sp\n\t" - "bl fe_mul\n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "mov r4, #19\n\t" - "\n" - "L_fe_pow22523_3_%=: \n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #0x40\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_pow22523_3_%=\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #0x40\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "mov r4, #10\n\t" - "\n" - "L_fe_pow22523_4_%=: \n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_pow22523_4_%=\n\t" - "mov r0, sp\n\t" - "add r1, sp, #32\n\t" - "mov r2, sp\n\t" - "bl fe_mul\n\t" - "add r0, sp, #32\n\t" - "mov r1, sp\n\t" - "bl fe_sq\n\t" - "mov r4, #49\n\t" - "\n" - "L_fe_pow22523_5_%=: \n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_pow22523_5_%=\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "mov r2, sp\n\t" - "bl fe_mul\n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "mov r4, #0x63\n\t" - "\n" - "L_fe_pow22523_6_%=: \n\t" - "add r0, sp, #0x40\n\t" - "add r1, sp, #0x40\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" 
- "bne L_fe_pow22523_6_%=\n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #0x40\n\t" - "add r2, sp, #32\n\t" - "bl fe_mul\n\t" - "mov r4, #50\n\t" - "\n" - "L_fe_pow22523_7_%=: \n\t" - "add r0, sp, #32\n\t" - "add r1, sp, #32\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_pow22523_7_%=\n\t" - "mov r0, sp\n\t" - "add r1, sp, #32\n\t" - "mov r2, sp\n\t" - "bl fe_mul\n\t" - "mov r4, #2\n\t" - "\n" - "L_fe_pow22523_8_%=: \n\t" - "mov r0, sp\n\t" - "mov r1, sp\n\t" - "bl fe_sq\n\t" - "sub r4, r4, #1\n\t" - "cmp r4, #0\n\t" - "bne L_fe_pow22523_8_%=\n\t" - "ldr r0, [sp, #96]\n\t" - "mov r1, sp\n\t" - "ldr r2, [sp, #100]\n\t" - "bl fe_mul\n\t" - "ldr %[a], [sp, #100]\n\t" - "ldr %[r], [sp, #96]\n\t" - "add sp, sp, #0x68\n\t" - : [r] "+r" (r), [a] "+r" (a) - : - : "memory", "lr", "r4" - ); -} - -void fe_ge_to_p2(fe rx_p, fe ry_p, fe rz_p, const fe px_p, const fe py_p, const fe pz_p, const fe pt_p) -{ - register fe rx asm ("r0") = rx_p; - register fe ry asm ("r1") = ry_p; - register fe rz asm ("r2") = rz_p; - register const fe px asm ("r3") = px_p; - register const fe py asm ("r4") = py_p; - register const fe pz asm ("r5") = pz_p; - register const fe pt asm ("r6") = pt_p; - - __asm__ __volatile__ ( - "sub sp, sp, #16\n\t" - "str %[rx], [sp]\n\t" - "str %[ry], [sp, #4]\n\t" - "str %[rz], [sp, #8]\n\t" - "str %[px], [sp, #12]\n\t" - "ldr r2, [sp, #28]\n\t" - "ldr r1, [sp, #12]\n\t" - "ldr r0, [sp]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #24]\n\t" - "ldr r1, [sp, #20]\n\t" - "ldr r0, [sp, #4]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #28]\n\t" - "ldr r1, [sp, #24]\n\t" - "ldr r0, [sp, #8]\n\t" - "bl fe_mul\n\t" - "add sp, sp, #16\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [px] "+r" (px) - : - : "memory", "lr" - ); - (void)py_p; - (void)pz_p; - (void)pt_p; -} - -void fe_ge_to_p3(fe rx_p, fe ry_p, fe rz_p, fe rt_p, const fe px_p, const fe py_p, const fe pz_p, const fe pt_p) -{ - register fe rx asm ("r0") = rx_p; - register fe ry asm ("r1") = ry_p; - register fe rz asm ("r2") = rz_p; - register fe rt asm ("r3") = rt_p; - register const fe px asm ("r4") = px_p; - register const fe py asm ("r5") = py_p; - register const fe pz asm ("r6") = pz_p; - register const fe pt asm ("r7") = pt_p; - - __asm__ __volatile__ ( - "sub sp, sp, #16\n\t" - "str %[rx], [sp]\n\t" - "str %[ry], [sp, #4]\n\t" - "str %[rz], [sp, #8]\n\t" - "str %[rt], [sp, #12]\n\t" - "ldr r2, [sp, #32]\n\t" - "ldr r1, [sp, #20]\n\t" - "ldr r0, [sp]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #28]\n\t" - "ldr r1, [sp, #24]\n\t" - "ldr r0, [sp, #4]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #32]\n\t" - "ldr r1, [sp, #28]\n\t" - "ldr r0, [sp, #8]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #24]\n\t" - "ldr r1, [sp, #20]\n\t" - "ldr r0, [sp, #12]\n\t" - "bl fe_mul\n\t" - "add sp, sp, #16\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) - : - : "memory", "lr" - ); - (void)px_p; - (void)py_p; - (void)pz_p; - (void)pt_p; -} - -void fe_ge_dbl(fe rx_p, fe ry_p, fe rz_p, fe rt_p, const fe px_p, const fe py_p, const fe pz_p) -{ - register fe rx asm ("r0") = rx_p; - register fe ry asm ("r1") = ry_p; - register fe rz asm ("r2") = rz_p; - register fe rt asm ("r3") = rt_p; - register const fe px asm ("r4") = px_p; - register const fe py asm ("r5") = py_p; - register const fe pz asm ("r6") = pz_p; - - __asm__ __volatile__ ( - "sub sp, sp, #16\n\t" - "str %[rx], [sp]\n\t" - "str %[ry], [sp, #4]\n\t" - "str %[rz], [sp, #8]\n\t" - "str %[rt], [sp, #12]\n\t" - "ldr r1, [sp, #52]\n\t" - "ldr r0, [sp]\n\t" - "bl fe_sq\n\t" 
- "ldr r1, [sp, #56]\n\t" - "ldr r0, [sp, #8]\n\t" - "bl fe_sq\n\t" - "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp, #52]\n\t" - "ldr r2, [sp, #56]\n\t" - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - "ldrd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #8]\n\t" - "ldr r7, [r1, #12]\n\t" -#else - "ldrd r6, r7, [r1, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r2]\n\t" - "ldr r9, [r2, #4]\n\t" -#else - "ldrd r8, r9, [r2]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r2, #8]\n\t" - "ldr r11, [r2, #12]\n\t" -#else - "ldrd r10, r11, [r2, #8]\n\t" -#endif - "adds r8, r4, r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adcs r11, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0]\n\t" - "str r9, [r0, #4]\n\t" -#else - "strd r8, r9, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #8]\n\t" - "str r11, [r0, #12]\n\t" -#else - "strd r10, r11, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #16]\n\t" - "ldr r5, [r1, #20]\n\t" -#else - "ldrd r4, r5, [r1, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #24]\n\t" - "ldr r7, [r1, #28]\n\t" -#else - "ldrd r6, r7, [r1, #24]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r2, #16]\n\t" - "ldr r9, [r2, #20]\n\t" -#else - "ldrd r8, r9, [r2, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r2, #24]\n\t" - "ldr r11, [r2, #28]\n\t" -#else - "ldrd r10, r11, [r2, #24]\n\t" -#endif - "adcs r8, r4, r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0]\n\t" - "ldr r5, [r0, #4]\n\t" -#else - "ldrd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #8]\n\t" - "ldr r7, [r0, #12]\n\t" -#else - "ldrd r6, r7, [r0, #8]\n\t" -#endif - "subs r4, r4, r12\n\t" - "sbcs r5, r5, %[rt]\n\t" - "sbcs r6, r6, %[rt]\n\t" - "sbcs r7, r7, %[rt]\n\t" - "sbcs r8, r8, %[rt]\n\t" - "sbcs r9, r9, %[rt]\n\t" - "sbcs r10, r10, %[rt]\n\t" - "sbc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [r0, #8]\n\t" - "str r7, [r0, #12]\n\t" -#else - "strd r6, r7, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #24]\n\t" - "str r11, [r0, #28]\n\t" -#else - "strd r10, r11, [r0, #24]\n\t" -#endif - "ldr r1, [sp, #4]\n\t" - "ldr r0, [sp, #12]\n\t" - "bl fe_sq\n\t" - "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp, #8]\n\t" - "ldr r2, [sp]\n\t" - /* Add-Sub */ - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - 
"ldrd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r2]\n\t" - "ldr r7, [r2, #4]\n\t" -#else - "ldrd r6, r7, [r2]\n\t" -#endif - "adds r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0]\n\t" - "str r9, [r0, #4]\n\t" -#else - "strd r8, r9, [r0]\n\t" -#endif - /* Sub */ - "subs r10, r4, r6\n\t" - "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1]\n\t" - "str r11, [r1, #4]\n\t" -#else - "strd r10, r11, [r1]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #8]\n\t" - "ldr r5, [r1, #12]\n\t" -#else - "ldrd r4, r5, [r1, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r2, #8]\n\t" - "ldr r7, [r2, #12]\n\t" -#else - "ldrd r6, r7, [r2, #8]\n\t" -#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #8]\n\t" - "str r9, [r0, #12]\n\t" -#else - "strd r8, r9, [r0, #8]\n\t" -#endif - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #8]\n\t" - "str r11, [r1, #12]\n\t" -#else - "strd r10, r11, [r1, #8]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #16]\n\t" - "ldr r5, [r1, #20]\n\t" -#else - "ldrd r4, r5, [r1, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r2, #16]\n\t" - "ldr r7, [r2, #20]\n\t" -#else - "ldrd r6, r7, [r2, #16]\n\t" -#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #16]\n\t" - "str r11, [r1, #20]\n\t" -#else - "strd r10, r11, [r1, #16]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #24]\n\t" - "ldr r5, [r1, #28]\n\t" -#else - "ldrd r4, r5, [r1, #24]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r2, #24]\n\t" - "ldr r7, [r2, #28]\n\t" -#else - "ldrd r6, r7, [r2, #24]\n\t" -#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "adc r9, r5, r7\n\t" - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "sbc r11, r5, r7\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r9, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0]\n\t" - "ldr r5, [r0, #4]\n\t" -#else - "ldrd r4, r5, [r0]\n\t" -#endif - "subs r4, r4, r12\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0, #8]\n\t" - "ldr r5, [r0, #12]\n\t" -#else - "ldrd r4, r5, [r0, #8]\n\t" -#endif - "sbcs r4, r4, %[rt]\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0, #8]\n\t" - "str r5, [r0, #12]\n\t" -#else - "strd r4, r5, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0, #16]\n\t" - "ldr r5, [r0, #20]\n\t" -#else - "ldrd r4, r5, [r0, #16]\n\t" -#endif - "sbcs r4, r4, %[rt]\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0, #16]\n\t" - "str r5, [r0, #20]\n\t" -#else - "strd r4, r5, [r0, #16]\n\t" -#endif - "sbcs r8, r8, %[rt]\n\t" - "sbc r9, r9, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #24]\n\t" - "str r9, [r0, #28]\n\t" -#else - "strd r8, r9, [r0, #24]\n\t" -#endif - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - "ldrd r4, r5, [r1]\n\t" -#endif - "adds r4, r4, r12\n\t" - "adcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1]\n\t" - "str r5, [r1, #4]\n\t" -#else - "strd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #8]\n\t" - "ldr r5, [r1, #12]\n\t" -#else - "ldrd r4, r5, [r1, #8]\n\t" -#endif - "adcs r4, r4, %[rt]\n\t" - "adcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1, #8]\n\t" - "str r5, [r1, #12]\n\t" -#else - "strd r4, r5, [r1, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #16]\n\t" - "ldr r5, [r1, #20]\n\t" -#else - "ldrd r4, r5, [r1, #16]\n\t" -#endif - "adcs r4, r4, %[rt]\n\t" - "adcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1, #16]\n\t" - "str r5, [r1, #20]\n\t" -#else - "strd r4, r5, [r1, #16]\n\t" -#endif - "adcs r10, r10, %[rt]\n\t" - "adc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #24]\n\t" - "str r11, [r1, #28]\n\t" -#else - "strd r10, r11, [r1, #24]\n\t" -#endif - "ldr r0, [sp]\n\t" - "ldr r1, [sp, #12]\n\t" - "ldr r2, [sp, #4]\n\t" - /* Sub */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - "ldrd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #8]\n\t" - "ldr r7, [r1, #12]\n\t" -#else - "ldrd r6, r7, [r1, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r2]\n\t" - "ldr r9, [r2, #4]\n\t" -#else - "ldrd r8, r9, [r2]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r2, #8]\n\t" - "ldr r11, [r2, #12]\n\t" -#else - "ldrd r10, r11, [r2, #8]\n\t" -#endif - "subs r8, r4, r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbcs r11, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0]\n\t" - "str r9, [r0, #4]\n\t" -#else - "strd r8, r9, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #8]\n\t" - "str r11, [r0, #12]\n\t" -#else - "strd r10, r11, [r0, #8]\n\t" -#endif -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #16]\n\t" - "ldr r5, [r1, #20]\n\t" -#else - "ldrd r4, r5, [r1, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #24]\n\t" - "ldr r7, [r1, #28]\n\t" -#else - "ldrd r6, r7, [r1, #24]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r2, #16]\n\t" - "ldr r9, [r2, #20]\n\t" -#else - "ldrd r8, r9, [r2, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r2, #24]\n\t" - "ldr r11, [r2, #28]\n\t" -#else - "ldrd r10, r11, [r2, #24]\n\t" -#endif - "sbcs r8, r4, r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0]\n\t" - "ldr r5, [r0, #4]\n\t" -#else - "ldrd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #8]\n\t" - "ldr r7, [r0, #12]\n\t" -#else - "ldrd r6, r7, [r0, #8]\n\t" -#endif - "adds r4, r4, r12\n\t" - "adcs r5, r5, %[rt]\n\t" - "adcs r6, r6, %[rt]\n\t" - "adcs r7, r7, %[rt]\n\t" - "adcs r8, r8, %[rt]\n\t" - "adcs r9, r9, %[rt]\n\t" - "adcs r10, r10, %[rt]\n\t" - "adc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [r0, #8]\n\t" - "str r7, [r0, #12]\n\t" -#else - "strd r6, r7, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #24]\n\t" - "str r11, [r0, #28]\n\t" -#else - "strd r10, r11, [r0, #24]\n\t" -#endif - "ldr r1, [sp, #60]\n\t" - "ldr r0, [sp, #12]\n\t" - "bl fe_sq2\n\t" - "ldr r0, [sp, #12]\n\t" - "ldr r1, [sp, #8]\n\t" - /* Sub */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0]\n\t" - "ldr r5, [r0, #4]\n\t" -#else - "ldrd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #8]\n\t" - "ldr r7, [r0, #12]\n\t" -#else - "ldrd r6, r7, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r1]\n\t" - "ldr r9, [r1, #4]\n\t" -#else - "ldrd r8, r9, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r1, #8]\n\t" - "ldr r11, [r1, #12]\n\t" -#else - "ldrd r10, r11, [r1, #8]\n\t" -#endif - "subs r8, r4, r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbcs r11, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0]\n\t" - "str r9, [r0, #4]\n\t" -#else - "strd r8, r9, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #8]\n\t" - "str r11, [r0, #12]\n\t" -#else - "strd r10, r11, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0, #16]\n\t" - "ldr r5, [r0, #20]\n\t" -#else - "ldrd r4, r5, [r0, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #24]\n\t" - "ldr r7, [r0, #28]\n\t" -#else - "ldrd r6, r7, [r0, #24]\n\t" 
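
The doubling routine fe_ge_dbl() being removed here implements the familiar dbl-p2 formulas: XX = X^2, YY = Y^2, B = 2*Z^2 (the fe_sq2 call above), A = (X+Y)^2, then Y3 = YY+XX, Z3 = YY-XX, X3 = A-Y3 and T3 = B-Z3; the inline Add, Add-Sub and Sub blocks are those field additions and subtractions written out limb by limb. A compact sketch with the C-level helpers, assuming the fe_add/fe_sub/fe_sq/fe_sq2 declarations from fe_operations.h:

#include <wolfssl/wolfcrypt/fe_operations.h>

/* Point doubling into completed coordinates, matching the sequence of
 * fe_sq/fe_sq2 calls and inline add/sub blocks above.  Sketch only. */
static void ge_dbl_sketch(fe rx, fe ry, fe rz, fe rt,
                          const fe px, const fe py, const fe pz)
{
    fe t0;

    fe_sq(rx, px);        /* XX = X^2        */
    fe_sq(rz, py);        /* YY = Y^2        */
    fe_add(ry, px, py);   /* X + Y           */
    fe_sq(t0, ry);        /* A  = (X + Y)^2  */
    fe_add(ry, rz, rx);   /* Y3 = YY + XX    */
    fe_sub(rz, rz, rx);   /* Z3 = YY - XX    */
    fe_sub(rx, t0, ry);   /* X3 = A - Y3     */
    fe_sq2(rt, pz);       /* B  = 2 * Z^2    */
    fe_sub(rt, rt, rz);   /* T3 = B - Z3     */
}
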
-#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r1, #16]\n\t" - "ldr r9, [r1, #20]\n\t" -#else - "ldrd r8, r9, [r1, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r1, #24]\n\t" - "ldr r11, [r1, #28]\n\t" -#else - "ldrd r10, r11, [r1, #24]\n\t" -#endif - "sbcs r8, r4, r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0]\n\t" - "ldr r5, [r0, #4]\n\t" -#else - "ldrd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #8]\n\t" - "ldr r7, [r0, #12]\n\t" -#else - "ldrd r6, r7, [r0, #8]\n\t" -#endif - "adds r4, r4, r12\n\t" - "adcs r5, r5, %[rt]\n\t" - "adcs r6, r6, %[rt]\n\t" - "adcs r7, r7, %[rt]\n\t" - "adcs r8, r8, %[rt]\n\t" - "adcs r9, r9, %[rt]\n\t" - "adcs r10, r10, %[rt]\n\t" - "adc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [r0, #8]\n\t" - "str r7, [r0, #12]\n\t" -#else - "strd r6, r7, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #24]\n\t" - "str r11, [r0, #28]\n\t" -#else - "strd r10, r11, [r0, #24]\n\t" -#endif - "add sp, sp, #16\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) - : - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" - ); - (void)px_p; - (void)py_p; - (void)pz_p; -} - -void fe_ge_madd(fe rx_p, fe ry_p, fe rz_p, fe rt_p, const fe px_p, const fe py_p, const fe pz_p, const fe pt_p, const fe qxy2d_p, const fe qyplusx_p, const fe qyminusx_p) -{ - register fe rx asm ("r0") = rx_p; - register fe ry asm ("r1") = ry_p; - register fe rz asm ("r2") = rz_p; - register fe rt asm ("r3") = rt_p; - register const fe px asm ("r4") = px_p; - register const fe py asm ("r5") = py_p; - register const fe pz asm ("r6") = pz_p; - register const fe pt asm ("r7") = pt_p; - register const fe qxy2d asm ("r8") = qxy2d_p; - register const fe qyplusx asm ("r9") = qyplusx_p; - register const fe qyminusx asm ("r10") = qyminusx_p; - - __asm__ __volatile__ ( - "sub sp, sp, #32\n\t" - "str %[rx], [sp]\n\t" - "str %[ry], [sp, #4]\n\t" - "str %[rz], [sp, #8]\n\t" - "str %[rt], [sp, #12]\n\t" - "ldr r0, [sp]\n\t" - "ldr r1, [sp, #72]\n\t" - "ldr r2, [sp, #68]\n\t" - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - "ldrd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #8]\n\t" - "ldr r7, [r1, #12]\n\t" -#else - "ldrd r6, r7, [r1, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r2]\n\t" - "ldr r9, [r2, #4]\n\t" -#else - "ldrd r8, r9, [r2]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r2, #8]\n\t" - "ldr r11, [r2, #12]\n\t" -#else - "ldrd r10, r11, [r2, #8]\n\t" -#endif - "adds r8, r4, r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs 
r10, r6, r10\n\t" - "adcs r11, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0]\n\t" - "str r9, [r0, #4]\n\t" -#else - "strd r8, r9, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #8]\n\t" - "str r11, [r0, #12]\n\t" -#else - "strd r10, r11, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #16]\n\t" - "ldr r5, [r1, #20]\n\t" -#else - "ldrd r4, r5, [r1, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #24]\n\t" - "ldr r7, [r1, #28]\n\t" -#else - "ldrd r6, r7, [r1, #24]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r2, #16]\n\t" - "ldr r9, [r2, #20]\n\t" -#else - "ldrd r8, r9, [r2, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r2, #24]\n\t" - "ldr r11, [r2, #28]\n\t" -#else - "ldrd r10, r11, [r2, #24]\n\t" -#endif - "adcs r8, r4, r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0]\n\t" - "ldr r5, [r0, #4]\n\t" -#else - "ldrd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #8]\n\t" - "ldr r7, [r0, #12]\n\t" -#else - "ldrd r6, r7, [r0, #8]\n\t" -#endif - "subs r4, r4, r12\n\t" - "sbcs r5, r5, %[rt]\n\t" - "sbcs r6, r6, %[rt]\n\t" - "sbcs r7, r7, %[rt]\n\t" - "sbcs r8, r8, %[rt]\n\t" - "sbcs r9, r9, %[rt]\n\t" - "sbcs r10, r10, %[rt]\n\t" - "sbc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [r0, #8]\n\t" - "str r7, [r0, #12]\n\t" -#else - "strd r6, r7, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #24]\n\t" - "str r11, [r0, #28]\n\t" -#else - "strd r10, r11, [r0, #24]\n\t" -#endif - "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp, #72]\n\t" - "ldr r2, [sp, #68]\n\t" - /* Sub */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - "ldrd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #8]\n\t" - "ldr r7, [r1, #12]\n\t" -#else - "ldrd r6, r7, [r1, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r2]\n\t" - "ldr r9, [r2, #4]\n\t" -#else - "ldrd r8, r9, [r2]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r2, #8]\n\t" - "ldr r11, [r2, #12]\n\t" -#else - "ldrd r10, r11, [r2, #8]\n\t" -#endif - "subs r8, r4, r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbcs r11, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0]\n\t" - "str r9, [r0, #4]\n\t" -#else - "strd r8, r9, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #8]\n\t" - "str r11, [r0, #12]\n\t" -#else - "strd r10, r11, [r0, #8]\n\t" 
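
Every Add, Sub and Double block in these routines ends with the same branch-free correction, visible above as the "Mask the modulus" / "Sub modulus (if overflow)" / "Add modulus (if underflow)" comments: the sign bit of the top limb is smeared into a mask with an arithmetic shift, and p = 2^255 - 19 (limbs 0xffffffed, six times 0xffffffff, 0x7fffffff) is added or subtracted under that mask. The same idea over eight 32-bit limbs, with an illustrative helper name:

#include <stdint.h>

/* Branch-free "add p back if the result went negative" step.  mask is all
 * ones exactly when the top limb's sign bit is set, so either p or 0 is
 * added; the overflow case subtracts the same masked limbs.  Sketch only. */
static void fe_add_p_if_negative(uint32_t h[8])
{
    uint32_t mask = 0U - (h[7] >> 31);            /* 0 or 0xffffffff     */
    uint64_t c = (uint64_t)h[0] + (mask & 0xffffffed);
    int i;

    h[0] = (uint32_t)c;
    for (i = 1; i < 7; i++) {                     /* middle limbs of p   */
        c = (uint64_t)h[i] + mask + (c >> 32);    /* are all 0xffffffff  */
        h[i] = (uint32_t)c;
    }
    h[7] += (mask & 0x7fffffff) + (uint32_t)(c >> 32);
}
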
-#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #16]\n\t" - "ldr r5, [r1, #20]\n\t" -#else - "ldrd r4, r5, [r1, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #24]\n\t" - "ldr r7, [r1, #28]\n\t" -#else - "ldrd r6, r7, [r1, #24]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r2, #16]\n\t" - "ldr r9, [r2, #20]\n\t" -#else - "ldrd r8, r9, [r2, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r2, #24]\n\t" - "ldr r11, [r2, #28]\n\t" -#else - "ldrd r10, r11, [r2, #24]\n\t" -#endif - "sbcs r8, r4, r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0]\n\t" - "ldr r5, [r0, #4]\n\t" -#else - "ldrd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #8]\n\t" - "ldr r7, [r0, #12]\n\t" -#else - "ldrd r6, r7, [r0, #8]\n\t" -#endif - "adds r4, r4, r12\n\t" - "adcs r5, r5, %[rt]\n\t" - "adcs r6, r6, %[rt]\n\t" - "adcs r7, r7, %[rt]\n\t" - "adcs r8, r8, %[rt]\n\t" - "adcs r9, r9, %[rt]\n\t" - "adcs r10, r10, %[rt]\n\t" - "adc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [r0, #8]\n\t" - "str r7, [r0, #12]\n\t" -#else - "strd r6, r7, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #24]\n\t" - "str r11, [r0, #28]\n\t" -#else - "strd r10, r11, [r0, #24]\n\t" -#endif - "ldr r2, [sp, #88]\n\t" - "ldr r1, [sp]\n\t" - "ldr r0, [sp, #8]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #92]\n\t" - "ldr r1, [sp, #4]\n\t" - "ldr r0, [sp, #4]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #80]\n\t" - "ldr r1, [sp, #84]\n\t" - "ldr r0, [sp, #12]\n\t" - "bl fe_mul\n\t" - "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp]\n\t" - "ldr r2, [sp, #8]\n\t" - /* Add-Sub */ - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2]\n\t" - "ldr r5, [r2, #4]\n\t" -#else - "ldrd r4, r5, [r2]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0]\n\t" - "ldr r7, [r0, #4]\n\t" -#else - "ldrd r6, r7, [r0]\n\t" -#endif - "adds r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0]\n\t" - "str r9, [r0, #4]\n\t" -#else - "strd r8, r9, [r0]\n\t" -#endif - /* Sub */ - "subs r10, r4, r6\n\t" - "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1]\n\t" - "str r11, [r1, #4]\n\t" -#else - "strd r10, r11, [r1]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2, #8]\n\t" - "ldr r5, [r2, #12]\n\t" -#else - "ldrd r4, r5, [r2, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #8]\n\t" - "ldr r7, [r0, #12]\n\t" 
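
fe_ge_madd() mixes a point in extended coordinates with a precomputed point stored as (y+x, y-x, 2dxy). The Add and Sub blocks above form Y1+X1 and Y1-X1, the three fe_mul calls produce A = (Y1+X1)*(y+x), B = (Y1-X1)*(y-x) and C = 2dxy*T1, and the Add-Sub and Double blocks that follow combine them. As plain C over the same helpers (a sketch; t0 and the function name are illustrative):

#include <wolfssl/wolfcrypt/fe_operations.h>

/* Mixed addition P + Q with Q precomputed as (y+x, y-x, 2dxy).  Sketch of
 * the computation the removed assembly performs with inline add/sub. */
static void ge_madd_sketch(fe rx, fe ry, fe rz, fe rt,
                           const fe px, const fe py, const fe pz, const fe pt,
                           const fe qxy2d, const fe qyplusx, const fe qyminusx)
{
    fe t0;

    fe_add(rx, py, px);          /* Y1 + X1            */
    fe_sub(ry, py, px);          /* Y1 - X1            */
    fe_mul(rz, rx, qyplusx);     /* A = (Y1+X1)*(y+x)  */
    fe_mul(ry, ry, qyminusx);    /* B = (Y1-X1)*(y-x)  */
    fe_mul(rt, qxy2d, pt);       /* C = 2dxy * T1      */
    fe_add(t0, pz, pz);          /* D = 2 * Z1         */
    fe_sub(rx, rz, ry);          /* X3 = A - B         */
    fe_add(ry, rz, ry);          /* Y3 = A + B         */
    fe_add(rz, t0, rt);          /* Z3 = D + C         */
    fe_sub(rt, t0, rt);          /* T3 = D - C         */
}
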
-#else - "ldrd r6, r7, [r0, #8]\n\t" -#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #8]\n\t" - "str r9, [r0, #12]\n\t" -#else - "strd r8, r9, [r0, #8]\n\t" -#endif - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #8]\n\t" - "str r11, [r1, #12]\n\t" -#else - "strd r10, r11, [r1, #8]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2, #16]\n\t" - "ldr r5, [r2, #20]\n\t" -#else - "ldrd r4, r5, [r2, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #16]\n\t" - "ldr r7, [r0, #20]\n\t" -#else - "ldrd r6, r7, [r0, #16]\n\t" -#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #16]\n\t" - "str r11, [r1, #20]\n\t" -#else - "strd r10, r11, [r1, #16]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2, #24]\n\t" - "ldr r5, [r2, #28]\n\t" -#else - "ldrd r4, r5, [r2, #24]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #24]\n\t" - "ldr r7, [r0, #28]\n\t" -#else - "ldrd r6, r7, [r0, #24]\n\t" -#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "adc r9, r5, r7\n\t" - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "sbc r11, r5, r7\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r9, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0]\n\t" - "ldr r5, [r0, #4]\n\t" -#else - "ldrd r4, r5, [r0]\n\t" -#endif - "subs r4, r4, r12\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0, #8]\n\t" - "ldr r5, [r0, #12]\n\t" -#else - "ldrd r4, r5, [r0, #8]\n\t" -#endif - "sbcs r4, r4, %[rt]\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0, #8]\n\t" - "str r5, [r0, #12]\n\t" -#else - "strd r4, r5, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0, #16]\n\t" - "ldr r5, [r0, #20]\n\t" -#else - "ldrd r4, r5, [r0, #16]\n\t" -#endif - "sbcs r4, r4, %[rt]\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0, #16]\n\t" - "str r5, [r0, #20]\n\t" -#else - "strd r4, r5, [r0, #16]\n\t" -#endif - "sbcs r8, r8, %[rt]\n\t" - "sbc r9, r9, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #24]\n\t" - "str r9, [r0, #28]\n\t" -#else - "strd r8, r9, [r0, #24]\n\t" -#endif - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - 
/* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - "ldrd r4, r5, [r1]\n\t" -#endif - "adds r4, r4, r12\n\t" - "adcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1]\n\t" - "str r5, [r1, #4]\n\t" -#else - "strd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #8]\n\t" - "ldr r5, [r1, #12]\n\t" -#else - "ldrd r4, r5, [r1, #8]\n\t" -#endif - "adcs r4, r4, %[rt]\n\t" - "adcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1, #8]\n\t" - "str r5, [r1, #12]\n\t" -#else - "strd r4, r5, [r1, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #16]\n\t" - "ldr r5, [r1, #20]\n\t" -#else - "ldrd r4, r5, [r1, #16]\n\t" -#endif - "adcs r4, r4, %[rt]\n\t" - "adcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1, #16]\n\t" - "str r5, [r1, #20]\n\t" -#else - "strd r4, r5, [r1, #16]\n\t" -#endif - "adcs r10, r10, %[rt]\n\t" - "adc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #24]\n\t" - "str r11, [r1, #28]\n\t" -#else - "strd r10, r11, [r1, #24]\n\t" -#endif - "ldr r0, [sp, #8]\n\t" - "ldr r1, [sp, #76]\n\t" - /* Double */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - "ldrd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #8]\n\t" - "ldr r7, [r1, #12]\n\t" -#else - "ldrd r6, r7, [r1, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r1, #16]\n\t" - "ldr r9, [r1, #20]\n\t" -#else - "ldrd r8, r9, [r1, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r1, #24]\n\t" - "ldr r11, [r1, #28]\n\t" -#else - "ldrd r10, r11, [r1, #24]\n\t" -#endif - "adds r4, r4, r4\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r7, r7, r7\n\t" - "adcs r8, r8, r8\n\t" - "adcs r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "subs r4, r4, r12\n\t" - "sbcs r5, r5, %[rt]\n\t" - "sbcs r6, r6, %[rt]\n\t" - "sbcs r7, r7, %[rt]\n\t" - "sbcs r8, r8, %[rt]\n\t" - "sbcs r9, r9, %[rt]\n\t" - "sbcs r10, r10, %[rt]\n\t" - "sbc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [r0, #8]\n\t" - "str r7, [r0, #12]\n\t" -#else - "strd r6, r7, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #24]\n\t" - "str r11, [r0, #28]\n\t" -#else - "strd r10, r11, [r0, #24]\n\t" -#endif - "ldr r0, [sp, #8]\n\t" - "ldr r1, [sp, #12]\n\t" - /* Add-Sub */ - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0]\n\t" - "ldr r5, [r0, #4]\n\t" -#else 
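
The Add-Sub blocks above and below compute a sum and a difference of the same two operands in lock-step, two limbs at a time, so each loaded pair is reused for both results; because the flags register cannot hold a carry and a borrow at once, the running flag is parked in a scratch register with "adc rX, rX, #0" and re-created later with "adds rX, rX, #-1". With explicit carry and borrow variables the same computation is simply the following sketch:

#include <stdint.h>

/* Paired add and subtract over eight 32-bit limbs, as in the interleaved
 * "Add-Sub" sequences: the saved carry/borrow play the role of the flag
 * bits the assembly parks in r12 and lr between limb pairs.  Sketch only. */
static void fe_add_sub_sketch(uint32_t sum[8], uint32_t diff[8],
                              const uint32_t a[8], const uint32_t b[8])
{
    uint32_t carry = 0, borrow = 0;
    int i;

    for (i = 0; i < 8; i++) {
        uint64_t s = (uint64_t)a[i] + b[i] + carry;
        uint64_t d = (uint64_t)a[i] - b[i] - borrow;

        sum[i]  = (uint32_t)s;
        diff[i] = (uint32_t)d;
        carry   = (uint32_t)(s >> 32);        /* 0 or 1                 */
        borrow  = (uint32_t)(d >> 32) & 1;    /* 1 when it went below 0 */
    }
    /* Both results then get the branch-free +/- p correction shown earlier. */
}
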
- "ldrd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1]\n\t" - "ldr r7, [r1, #4]\n\t" -#else - "ldrd r6, r7, [r1]\n\t" -#endif - "adds r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0]\n\t" - "str r9, [r0, #4]\n\t" -#else - "strd r8, r9, [r0]\n\t" -#endif - /* Sub */ - "subs r10, r4, r6\n\t" - "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1]\n\t" - "str r11, [r1, #4]\n\t" -#else - "strd r10, r11, [r1]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0, #8]\n\t" - "ldr r5, [r0, #12]\n\t" -#else - "ldrd r4, r5, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #8]\n\t" - "ldr r7, [r1, #12]\n\t" -#else - "ldrd r6, r7, [r1, #8]\n\t" -#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #8]\n\t" - "str r9, [r0, #12]\n\t" -#else - "strd r8, r9, [r0, #8]\n\t" -#endif - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #8]\n\t" - "str r11, [r1, #12]\n\t" -#else - "strd r10, r11, [r1, #8]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0, #16]\n\t" - "ldr r5, [r0, #20]\n\t" -#else - "ldrd r4, r5, [r0, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #16]\n\t" - "ldr r7, [r1, #20]\n\t" -#else - "ldrd r6, r7, [r1, #16]\n\t" -#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #16]\n\t" - "str r11, [r1, #20]\n\t" -#else - "strd r10, r11, [r1, #16]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0, #24]\n\t" - "ldr r5, [r0, #28]\n\t" -#else - "ldrd r4, r5, [r0, #24]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #24]\n\t" - "ldr r7, [r1, #28]\n\t" -#else - "ldrd r6, r7, [r1, #24]\n\t" -#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "adc r9, r5, r7\n\t" - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "sbc r11, r5, r7\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r9, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0]\n\t" - "ldr r5, [r0, #4]\n\t" -#else - "ldrd r4, r5, [r0]\n\t" -#endif - "subs r4, r4, r12\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0, #8]\n\t" - "ldr r5, [r0, #12]\n\t" -#else - "ldrd r4, r5, [r0, #8]\n\t" -#endif - "sbcs r4, r4, %[rt]\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0, #8]\n\t" - "str r5, [r0, #12]\n\t" -#else - "strd r4, r5, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0, #16]\n\t" - "ldr r5, [r0, #20]\n\t" -#else - "ldrd r4, r5, [r0, #16]\n\t" -#endif - "sbcs r4, r4, %[rt]\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0, #16]\n\t" - "str r5, [r0, #20]\n\t" -#else - "strd r4, r5, [r0, #16]\n\t" -#endif - "sbcs r8, r8, %[rt]\n\t" - "sbc r9, r9, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #24]\n\t" - "str r9, [r0, #28]\n\t" -#else - "strd r8, r9, [r0, #24]\n\t" -#endif - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - "ldrd r4, r5, [r1]\n\t" -#endif - "adds r4, r4, r12\n\t" - "adcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1]\n\t" - "str r5, [r1, #4]\n\t" -#else - "strd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #8]\n\t" - "ldr r5, [r1, #12]\n\t" -#else - "ldrd r4, r5, [r1, #8]\n\t" -#endif - "adcs r4, r4, %[rt]\n\t" - "adcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1, #8]\n\t" - "str r5, [r1, #12]\n\t" -#else - "strd r4, r5, [r1, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #16]\n\t" - "ldr r5, [r1, #20]\n\t" -#else - "ldrd r4, r5, [r1, #16]\n\t" -#endif - "adcs r4, r4, %[rt]\n\t" - "adcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1, #16]\n\t" - "str r5, [r1, #20]\n\t" -#else - "strd r4, r5, [r1, #16]\n\t" -#endif - "adcs r10, r10, %[rt]\n\t" - "adc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #24]\n\t" - "str r11, [r1, #28]\n\t" -#else - "strd r10, r11, [r1, #24]\n\t" -#endif - "add sp, sp, #32\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) - : - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" - ); - (void)px_p; - (void)py_p; - (void)pz_p; - (void)pt_p; - (void)qxy2d_p; - (void)qyplusx_p; - (void)qyminusx_p; -} - -void fe_ge_msub(fe rx_p, fe ry_p, fe rz_p, fe rt_p, const fe px_p, const fe py_p, const fe pz_p, const fe pt_p, const fe qxy2d_p, const fe qyplusx_p, const fe qyminusx_p) -{ - register fe rx asm ("r0") = rx_p; - register fe ry asm ("r1") = ry_p; - register fe rz asm ("r2") = rz_p; - register fe rt asm ("r3") = rt_p; - register const fe px asm ("r4") = px_p; - register const fe py asm ("r5") = py_p; - register const fe pz asm ("r6") = pz_p; - register const fe pt asm ("r7") = pt_p; - register const fe qxy2d asm ("r8") = qxy2d_p; - register const fe qyplusx asm ("r9") = qyplusx_p; - register const fe qyminusx asm ("r10") = qyminusx_p; - - __asm__ __volatile__ ( - "sub sp, sp, #32\n\t" - "str %[rx], [sp]\n\t" - "str %[ry], [sp, #4]\n\t" - "str %[rz], [sp, #8]\n\t" - "str %[rt], [sp, #12]\n\t" - "ldr r0, [sp]\n\t" - "ldr r1, [sp, 
#72]\n\t" - "ldr r2, [sp, #68]\n\t" - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - "ldrd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #8]\n\t" - "ldr r7, [r1, #12]\n\t" -#else - "ldrd r6, r7, [r1, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r2]\n\t" - "ldr r9, [r2, #4]\n\t" -#else - "ldrd r8, r9, [r2]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r2, #8]\n\t" - "ldr r11, [r2, #12]\n\t" -#else - "ldrd r10, r11, [r2, #8]\n\t" -#endif - "adds r8, r4, r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adcs r11, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0]\n\t" - "str r9, [r0, #4]\n\t" -#else - "strd r8, r9, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #8]\n\t" - "str r11, [r0, #12]\n\t" -#else - "strd r10, r11, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #16]\n\t" - "ldr r5, [r1, #20]\n\t" -#else - "ldrd r4, r5, [r1, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #24]\n\t" - "ldr r7, [r1, #28]\n\t" -#else - "ldrd r6, r7, [r1, #24]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r2, #16]\n\t" - "ldr r9, [r2, #20]\n\t" -#else - "ldrd r8, r9, [r2, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r2, #24]\n\t" - "ldr r11, [r2, #28]\n\t" -#else - "ldrd r10, r11, [r2, #24]\n\t" -#endif - "adcs r8, r4, r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0]\n\t" - "ldr r5, [r0, #4]\n\t" -#else - "ldrd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #8]\n\t" - "ldr r7, [r0, #12]\n\t" -#else - "ldrd r6, r7, [r0, #8]\n\t" -#endif - "subs r4, r4, r12\n\t" - "sbcs r5, r5, %[rt]\n\t" - "sbcs r6, r6, %[rt]\n\t" - "sbcs r7, r7, %[rt]\n\t" - "sbcs r8, r8, %[rt]\n\t" - "sbcs r9, r9, %[rt]\n\t" - "sbcs r10, r10, %[rt]\n\t" - "sbc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [r0, #8]\n\t" - "str r7, [r0, #12]\n\t" -#else - "strd r6, r7, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #24]\n\t" - "str r11, [r0, #28]\n\t" -#else - "strd r10, r11, [r0, #24]\n\t" -#endif - "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp, #72]\n\t" - "ldr r2, [sp, #68]\n\t" - /* Sub */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - "ldrd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #8]\n\t" - "ldr r7, [r1, #12]\n\t" -#else - "ldrd r6, r7, [r1, #8]\n\t" 
-#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r2]\n\t" - "ldr r9, [r2, #4]\n\t" -#else - "ldrd r8, r9, [r2]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r2, #8]\n\t" - "ldr r11, [r2, #12]\n\t" -#else - "ldrd r10, r11, [r2, #8]\n\t" -#endif - "subs r8, r4, r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbcs r11, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0]\n\t" - "str r9, [r0, #4]\n\t" -#else - "strd r8, r9, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #8]\n\t" - "str r11, [r0, #12]\n\t" -#else - "strd r10, r11, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #16]\n\t" - "ldr r5, [r1, #20]\n\t" -#else - "ldrd r4, r5, [r1, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #24]\n\t" - "ldr r7, [r1, #28]\n\t" -#else - "ldrd r6, r7, [r1, #24]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r2, #16]\n\t" - "ldr r9, [r2, #20]\n\t" -#else - "ldrd r8, r9, [r2, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r2, #24]\n\t" - "ldr r11, [r2, #28]\n\t" -#else - "ldrd r10, r11, [r2, #24]\n\t" -#endif - "sbcs r8, r4, r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0]\n\t" - "ldr r5, [r0, #4]\n\t" -#else - "ldrd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #8]\n\t" - "ldr r7, [r0, #12]\n\t" -#else - "ldrd r6, r7, [r0, #8]\n\t" -#endif - "adds r4, r4, r12\n\t" - "adcs r5, r5, %[rt]\n\t" - "adcs r6, r6, %[rt]\n\t" - "adcs r7, r7, %[rt]\n\t" - "adcs r8, r8, %[rt]\n\t" - "adcs r9, r9, %[rt]\n\t" - "adcs r10, r10, %[rt]\n\t" - "adc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [r0, #8]\n\t" - "str r7, [r0, #12]\n\t" -#else - "strd r6, r7, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #24]\n\t" - "str r11, [r0, #28]\n\t" -#else - "strd r10, r11, [r0, #24]\n\t" -#endif - "ldr r2, [sp, #92]\n\t" - "ldr r1, [sp]\n\t" - "ldr r0, [sp, #8]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #88]\n\t" - "ldr r1, [sp, #4]\n\t" - "ldr r0, [sp, #4]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #80]\n\t" - "ldr r1, [sp, #84]\n\t" - "ldr r0, [sp, #12]\n\t" - "bl fe_mul\n\t" - "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp]\n\t" - "ldr r2, [sp, #8]\n\t" - /* Add-Sub */ - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2]\n\t" - "ldr r5, [r2, #4]\n\t" -#else - "ldrd r4, r5, [r2]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0]\n\t" - "ldr r7, [r0, #4]\n\t" -#else - "ldrd r6, r7, [r0]\n\t" -#endif - "adds r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, 
r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0]\n\t" - "str r9, [r0, #4]\n\t" -#else - "strd r8, r9, [r0]\n\t" -#endif - /* Sub */ - "subs r10, r4, r6\n\t" - "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1]\n\t" - "str r11, [r1, #4]\n\t" -#else - "strd r10, r11, [r1]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2, #8]\n\t" - "ldr r5, [r2, #12]\n\t" -#else - "ldrd r4, r5, [r2, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #8]\n\t" - "ldr r7, [r0, #12]\n\t" -#else - "ldrd r6, r7, [r0, #8]\n\t" -#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #8]\n\t" - "str r9, [r0, #12]\n\t" -#else - "strd r8, r9, [r0, #8]\n\t" -#endif - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #8]\n\t" - "str r11, [r1, #12]\n\t" -#else - "strd r10, r11, [r1, #8]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2, #16]\n\t" - "ldr r5, [r2, #20]\n\t" -#else - "ldrd r4, r5, [r2, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #16]\n\t" - "ldr r7, [r0, #20]\n\t" -#else - "ldrd r6, r7, [r0, #16]\n\t" -#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #16]\n\t" - "str r11, [r1, #20]\n\t" -#else - "strd r10, r11, [r1, #16]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2, #24]\n\t" - "ldr r5, [r2, #28]\n\t" -#else - "ldrd r4, r5, [r2, #24]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #24]\n\t" - "ldr r7, [r0, #28]\n\t" -#else - "ldrd r6, r7, [r0, #24]\n\t" -#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "adc r9, r5, r7\n\t" - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "sbc r11, r5, r7\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r9, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0]\n\t" - "ldr r5, [r0, #4]\n\t" -#else - "ldrd r4, r5, [r0]\n\t" -#endif - "subs r4, r4, r12\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0, #8]\n\t" - "ldr r5, [r0, #12]\n\t" -#else - "ldrd r4, r5, [r0, #8]\n\t" -#endif - "sbcs r4, r4, %[rt]\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, 
[r0, #8]\n\t" - "str r5, [r0, #12]\n\t" -#else - "strd r4, r5, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0, #16]\n\t" - "ldr r5, [r0, #20]\n\t" -#else - "ldrd r4, r5, [r0, #16]\n\t" -#endif - "sbcs r4, r4, %[rt]\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0, #16]\n\t" - "str r5, [r0, #20]\n\t" -#else - "strd r4, r5, [r0, #16]\n\t" -#endif - "sbcs r8, r8, %[rt]\n\t" - "sbc r9, r9, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #24]\n\t" - "str r9, [r0, #28]\n\t" -#else - "strd r8, r9, [r0, #24]\n\t" -#endif - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - "ldrd r4, r5, [r1]\n\t" -#endif - "adds r4, r4, r12\n\t" - "adcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1]\n\t" - "str r5, [r1, #4]\n\t" -#else - "strd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #8]\n\t" - "ldr r5, [r1, #12]\n\t" -#else - "ldrd r4, r5, [r1, #8]\n\t" -#endif - "adcs r4, r4, %[rt]\n\t" - "adcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1, #8]\n\t" - "str r5, [r1, #12]\n\t" -#else - "strd r4, r5, [r1, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #16]\n\t" - "ldr r5, [r1, #20]\n\t" -#else - "ldrd r4, r5, [r1, #16]\n\t" -#endif - "adcs r4, r4, %[rt]\n\t" - "adcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1, #16]\n\t" - "str r5, [r1, #20]\n\t" -#else - "strd r4, r5, [r1, #16]\n\t" -#endif - "adcs r10, r10, %[rt]\n\t" - "adc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #24]\n\t" - "str r11, [r1, #28]\n\t" -#else - "strd r10, r11, [r1, #24]\n\t" -#endif - "ldr r0, [sp, #8]\n\t" - "ldr r1, [sp, #76]\n\t" - /* Double */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - "ldrd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #8]\n\t" - "ldr r7, [r1, #12]\n\t" -#else - "ldrd r6, r7, [r1, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r1, #16]\n\t" - "ldr r9, [r1, #20]\n\t" -#else - "ldrd r8, r9, [r1, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r1, #24]\n\t" - "ldr r11, [r1, #28]\n\t" -#else - "ldrd r10, r11, [r1, #24]\n\t" -#endif - "adds r4, r4, r4\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r7, r7, r7\n\t" - "adcs r8, r8, r8\n\t" - "adcs r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "subs r4, r4, r12\n\t" - "sbcs r5, r5, %[rt]\n\t" - "sbcs r6, r6, %[rt]\n\t" - "sbcs r7, r7, %[rt]\n\t" - "sbcs r8, r8, %[rt]\n\t" - "sbcs r9, r9, %[rt]\n\t" - "sbcs r10, r10, %[rt]\n\t" - "sbc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, 
#4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [r0, #8]\n\t" - "str r7, [r0, #12]\n\t" -#else - "strd r6, r7, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #24]\n\t" - "str r11, [r0, #28]\n\t" -#else - "strd r10, r11, [r0, #24]\n\t" -#endif - "ldr r0, [sp, #12]\n\t" - "ldr r1, [sp, #8]\n\t" - /* Add-Sub */ - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - "ldrd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0]\n\t" - "ldr r7, [r0, #4]\n\t" -#else - "ldrd r6, r7, [r0]\n\t" -#endif - "adds r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0]\n\t" - "str r9, [r0, #4]\n\t" -#else - "strd r8, r9, [r0]\n\t" -#endif - /* Sub */ - "subs r10, r4, r6\n\t" - "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1]\n\t" - "str r11, [r1, #4]\n\t" -#else - "strd r10, r11, [r1]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #8]\n\t" - "ldr r5, [r1, #12]\n\t" -#else - "ldrd r4, r5, [r1, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #8]\n\t" - "ldr r7, [r0, #12]\n\t" -#else - "ldrd r6, r7, [r0, #8]\n\t" -#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #8]\n\t" - "str r9, [r0, #12]\n\t" -#else - "strd r8, r9, [r0, #8]\n\t" -#endif - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #8]\n\t" - "str r11, [r1, #12]\n\t" -#else - "strd r10, r11, [r1, #8]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #16]\n\t" - "ldr r5, [r1, #20]\n\t" -#else - "ldrd r4, r5, [r1, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #16]\n\t" - "ldr r7, [r0, #20]\n\t" -#else - "ldrd r6, r7, [r0, #16]\n\t" -#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #16]\n\t" - "str r11, [r1, #20]\n\t" -#else - "strd r10, r11, [r1, #16]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #24]\n\t" - "ldr r5, [r1, #28]\n\t" -#else - "ldrd r4, r5, [r1, #24]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #24]\n\t" - "ldr r7, [r0, #28]\n\t" -#else - "ldrd r6, r7, [r0, #24]\n\t" 
-#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "adc r9, r5, r7\n\t" - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "sbc r11, r5, r7\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r9, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0]\n\t" - "ldr r5, [r0, #4]\n\t" -#else - "ldrd r4, r5, [r0]\n\t" -#endif - "subs r4, r4, r12\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0, #8]\n\t" - "ldr r5, [r0, #12]\n\t" -#else - "ldrd r4, r5, [r0, #8]\n\t" -#endif - "sbcs r4, r4, %[rt]\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0, #8]\n\t" - "str r5, [r0, #12]\n\t" -#else - "strd r4, r5, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0, #16]\n\t" - "ldr r5, [r0, #20]\n\t" -#else - "ldrd r4, r5, [r0, #16]\n\t" -#endif - "sbcs r4, r4, %[rt]\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0, #16]\n\t" - "str r5, [r0, #20]\n\t" -#else - "strd r4, r5, [r0, #16]\n\t" -#endif - "sbcs r8, r8, %[rt]\n\t" - "sbc r9, r9, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #24]\n\t" - "str r9, [r0, #28]\n\t" -#else - "strd r8, r9, [r0, #24]\n\t" -#endif - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - "ldrd r4, r5, [r1]\n\t" -#endif - "adds r4, r4, r12\n\t" - "adcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1]\n\t" - "str r5, [r1, #4]\n\t" -#else - "strd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #8]\n\t" - "ldr r5, [r1, #12]\n\t" -#else - "ldrd r4, r5, [r1, #8]\n\t" -#endif - "adcs r4, r4, %[rt]\n\t" - "adcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1, #8]\n\t" - "str r5, [r1, #12]\n\t" -#else - "strd r4, r5, [r1, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #16]\n\t" - "ldr r5, [r1, #20]\n\t" -#else - "ldrd r4, r5, [r1, #16]\n\t" -#endif - "adcs r4, r4, %[rt]\n\t" - "adcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1, #16]\n\t" - "str r5, [r1, #20]\n\t" -#else - "strd r4, r5, [r1, #16]\n\t" -#endif - "adcs r10, r10, %[rt]\n\t" - "adc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #24]\n\t" - "str r11, [r1, #28]\n\t" -#else - "strd r10, r11, [r1, #24]\n\t" -#endif - "add sp, sp, #32\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) - : - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" - ); - (void)px_p; - (void)py_p; - (void)pz_p; - (void)pt_p; - (void)qxy2d_p; - (void)qyplusx_p; - (void)qyminusx_p; -} - -void fe_ge_add(fe rx_p, fe ry_p, fe rz_p, fe rt_p, const fe px_p, const fe py_p, const fe pz_p, const fe pt_p, const 
fe qz_p, const fe qt2d_p, const fe qyplusx_p, const fe qyminusx_p) -{ - register fe rx asm ("r0") = rx_p; - register fe ry asm ("r1") = ry_p; - register fe rz asm ("r2") = rz_p; - register fe rt asm ("r3") = rt_p; - register const fe px asm ("r4") = px_p; - register const fe py asm ("r5") = py_p; - register const fe pz asm ("r6") = pz_p; - register const fe pt asm ("r7") = pt_p; - register const fe qz asm ("r8") = qz_p; - register const fe qt2d asm ("r9") = qt2d_p; - register const fe qyplusx asm ("r10") = qyplusx_p; - register const fe qyminusx asm ("r11") = qyminusx_p; - - __asm__ __volatile__ ( - "sub sp, sp, #0x60\n\t" - "str %[rx], [sp]\n\t" - "str %[ry], [sp, #4]\n\t" - "str %[rz], [sp, #8]\n\t" - "str %[rt], [sp, #12]\n\t" - "ldr r0, [sp]\n\t" - "ldr r1, [sp, #136]\n\t" - "ldr r2, [sp, #132]\n\t" - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - "ldrd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #8]\n\t" - "ldr r7, [r1, #12]\n\t" -#else - "ldrd r6, r7, [r1, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r2]\n\t" - "ldr r9, [r2, #4]\n\t" -#else - "ldrd r8, r9, [r2]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r2, #8]\n\t" - "ldr r11, [r2, #12]\n\t" -#else - "ldrd r10, r11, [r2, #8]\n\t" -#endif - "adds r8, r4, r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adcs r11, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0]\n\t" - "str r9, [r0, #4]\n\t" -#else - "strd r8, r9, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #8]\n\t" - "str r11, [r0, #12]\n\t" -#else - "strd r10, r11, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #16]\n\t" - "ldr r5, [r1, #20]\n\t" -#else - "ldrd r4, r5, [r1, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #24]\n\t" - "ldr r7, [r1, #28]\n\t" -#else - "ldrd r6, r7, [r1, #24]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r2, #16]\n\t" - "ldr r9, [r2, #20]\n\t" -#else - "ldrd r8, r9, [r2, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r2, #24]\n\t" - "ldr r11, [r2, #28]\n\t" -#else - "ldrd r10, r11, [r2, #24]\n\t" -#endif - "adcs r8, r4, r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0]\n\t" - "ldr r5, [r0, #4]\n\t" -#else - "ldrd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #8]\n\t" - "ldr r7, [r0, #12]\n\t" -#else - "ldrd r6, r7, [r0, #8]\n\t" -#endif - "subs r4, r4, r12\n\t" - "sbcs r5, r5, %[rt]\n\t" - "sbcs r6, r6, %[rt]\n\t" - "sbcs r7, r7, %[rt]\n\t" - "sbcs r8, r8, %[rt]\n\t" - "sbcs r9, r9, %[rt]\n\t" - "sbcs r10, r10, %[rt]\n\t" - "sbc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [r0, #8]\n\t" - "str r7, 
[r0, #12]\n\t" -#else - "strd r6, r7, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #24]\n\t" - "str r11, [r0, #28]\n\t" -#else - "strd r10, r11, [r0, #24]\n\t" -#endif - "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp, #136]\n\t" - "ldr r2, [sp, #132]\n\t" - /* Sub */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - "ldrd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #8]\n\t" - "ldr r7, [r1, #12]\n\t" -#else - "ldrd r6, r7, [r1, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r2]\n\t" - "ldr r9, [r2, #4]\n\t" -#else - "ldrd r8, r9, [r2]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r2, #8]\n\t" - "ldr r11, [r2, #12]\n\t" -#else - "ldrd r10, r11, [r2, #8]\n\t" -#endif - "subs r8, r4, r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbcs r11, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0]\n\t" - "str r9, [r0, #4]\n\t" -#else - "strd r8, r9, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #8]\n\t" - "str r11, [r0, #12]\n\t" -#else - "strd r10, r11, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #16]\n\t" - "ldr r5, [r1, #20]\n\t" -#else - "ldrd r4, r5, [r1, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #24]\n\t" - "ldr r7, [r1, #28]\n\t" -#else - "ldrd r6, r7, [r1, #24]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r2, #16]\n\t" - "ldr r9, [r2, #20]\n\t" -#else - "ldrd r8, r9, [r2, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r2, #24]\n\t" - "ldr r11, [r2, #28]\n\t" -#else - "ldrd r10, r11, [r2, #24]\n\t" -#endif - "sbcs r8, r4, r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0]\n\t" - "ldr r5, [r0, #4]\n\t" -#else - "ldrd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #8]\n\t" - "ldr r7, [r0, #12]\n\t" -#else - "ldrd r6, r7, [r0, #8]\n\t" -#endif - "adds r4, r4, r12\n\t" - "adcs r5, r5, %[rt]\n\t" - "adcs r6, r6, %[rt]\n\t" - "adcs r7, r7, %[rt]\n\t" - "adcs r8, r8, %[rt]\n\t" - "adcs r9, r9, %[rt]\n\t" - "adcs r10, r10, %[rt]\n\t" - "adc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [r0, #8]\n\t" - "str r7, [r0, #12]\n\t" -#else - "strd r6, r7, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #24]\n\t" - "str r11, [r0, #28]\n\t" -#else - 
"strd r10, r11, [r0, #24]\n\t" -#endif - "ldr r2, [sp, #156]\n\t" - "ldr r1, [sp]\n\t" - "ldr r0, [sp, #8]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #160]\n\t" - "ldr r1, [sp, #4]\n\t" - "ldr r0, [sp, #4]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #144]\n\t" - "ldr r1, [sp, #152]\n\t" - "ldr r0, [sp, #12]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #148]\n\t" - "ldr r1, [sp, #140]\n\t" - "ldr r0, [sp]\n\t" - "bl fe_mul\n\t" - "add r0, sp, #16\n\t" - "ldr r1, [sp]\n\t" - /* Double */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - "ldrd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #8]\n\t" - "ldr r7, [r1, #12]\n\t" -#else - "ldrd r6, r7, [r1, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r1, #16]\n\t" - "ldr r9, [r1, #20]\n\t" -#else - "ldrd r8, r9, [r1, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r1, #24]\n\t" - "ldr r11, [r1, #28]\n\t" -#else - "ldrd r10, r11, [r1, #24]\n\t" -#endif - "adds r4, r4, r4\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r7, r7, r7\n\t" - "adcs r8, r8, r8\n\t" - "adcs r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ - "subs r4, r4, r12\n\t" - "sbcs r5, r5, %[rt]\n\t" - "sbcs r6, r6, %[rt]\n\t" - "sbcs r7, r7, %[rt]\n\t" - "sbcs r8, r8, %[rt]\n\t" - "sbcs r9, r9, %[rt]\n\t" - "sbcs r10, r10, %[rt]\n\t" - "sbc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [r0, #8]\n\t" - "str r7, [r0, #12]\n\t" -#else - "strd r6, r7, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #24]\n\t" - "str r11, [r0, #28]\n\t" -#else - "strd r10, r11, [r0, #24]\n\t" -#endif - "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp]\n\t" - "ldr r2, [sp, #8]\n\t" - /* Add-Sub */ - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2]\n\t" - "ldr r5, [r2, #4]\n\t" -#else - "ldrd r4, r5, [r2]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0]\n\t" - "ldr r7, [r0, #4]\n\t" -#else - "ldrd r6, r7, [r0]\n\t" -#endif - "adds r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0]\n\t" - "str r9, [r0, #4]\n\t" -#else - "strd r8, r9, [r0]\n\t" -#endif - /* Sub */ - "subs r10, r4, r6\n\t" - "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1]\n\t" - "str r11, [r1, #4]\n\t" -#else - "strd r10, r11, [r1]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2, #8]\n\t" - "ldr r5, [r2, #12]\n\t" -#else - "ldrd r4, r5, [r2, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #8]\n\t" - "ldr r7, [r0, #12]\n\t" -#else - "ldrd r6, r7, [r0, #8]\n\t" 
-#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #8]\n\t" - "str r9, [r0, #12]\n\t" -#else - "strd r8, r9, [r0, #8]\n\t" -#endif - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #8]\n\t" - "str r11, [r1, #12]\n\t" -#else - "strd r10, r11, [r1, #8]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2, #16]\n\t" - "ldr r5, [r2, #20]\n\t" -#else - "ldrd r4, r5, [r2, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #16]\n\t" - "ldr r7, [r0, #20]\n\t" -#else - "ldrd r6, r7, [r0, #16]\n\t" -#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #16]\n\t" - "str r11, [r1, #20]\n\t" -#else - "strd r10, r11, [r1, #16]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2, #24]\n\t" - "ldr r5, [r2, #28]\n\t" -#else - "ldrd r4, r5, [r2, #24]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #24]\n\t" - "ldr r7, [r0, #28]\n\t" -#else - "ldrd r6, r7, [r0, #24]\n\t" -#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "adc r9, r5, r7\n\t" - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "sbc r11, r5, r7\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r9, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0]\n\t" - "ldr r5, [r0, #4]\n\t" -#else - "ldrd r4, r5, [r0]\n\t" -#endif - "subs r4, r4, r12\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0, #8]\n\t" - "ldr r5, [r0, #12]\n\t" -#else - "ldrd r4, r5, [r0, #8]\n\t" -#endif - "sbcs r4, r4, %[rt]\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0, #8]\n\t" - "str r5, [r0, #12]\n\t" -#else - "strd r4, r5, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0, #16]\n\t" - "ldr r5, [r0, #20]\n\t" -#else - "ldrd r4, r5, [r0, #16]\n\t" -#endif - "sbcs r4, r4, %[rt]\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0, #16]\n\t" - "str r5, [r0, #20]\n\t" -#else - "strd r4, r5, [r0, #16]\n\t" -#endif - "sbcs r8, r8, %[rt]\n\t" - "sbc r9, r9, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #24]\n\t" - "str r9, [r0, #28]\n\t" -#else - "strd r8, r9, [r0, #24]\n\t" -#endif - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, 
%[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - "ldrd r4, r5, [r1]\n\t" -#endif - "adds r4, r4, r12\n\t" - "adcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1]\n\t" - "str r5, [r1, #4]\n\t" -#else - "strd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #8]\n\t" - "ldr r5, [r1, #12]\n\t" -#else - "ldrd r4, r5, [r1, #8]\n\t" -#endif - "adcs r4, r4, %[rt]\n\t" - "adcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1, #8]\n\t" - "str r5, [r1, #12]\n\t" -#else - "strd r4, r5, [r1, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #16]\n\t" - "ldr r5, [r1, #20]\n\t" -#else - "ldrd r4, r5, [r1, #16]\n\t" -#endif - "adcs r4, r4, %[rt]\n\t" - "adcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1, #16]\n\t" - "str r5, [r1, #20]\n\t" -#else - "strd r4, r5, [r1, #16]\n\t" -#endif - "adcs r10, r10, %[rt]\n\t" - "adc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #24]\n\t" - "str r11, [r1, #28]\n\t" -#else - "strd r10, r11, [r1, #24]\n\t" -#endif - "ldr r0, [sp, #8]\n\t" - "ldr r1, [sp, #12]\n\t" - "add r2, sp, #16\n\t" - /* Add-Sub */ - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2]\n\t" - "ldr r5, [r2, #4]\n\t" -#else - "ldrd r4, r5, [r2]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1]\n\t" - "ldr r7, [r1, #4]\n\t" -#else - "ldrd r6, r7, [r1]\n\t" -#endif - "adds r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0]\n\t" - "str r9, [r0, #4]\n\t" -#else - "strd r8, r9, [r0]\n\t" -#endif - /* Sub */ - "subs r10, r4, r6\n\t" - "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1]\n\t" - "str r11, [r1, #4]\n\t" -#else - "strd r10, r11, [r1]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2, #8]\n\t" - "ldr r5, [r2, #12]\n\t" -#else - "ldrd r4, r5, [r2, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #8]\n\t" - "ldr r7, [r1, #12]\n\t" -#else - "ldrd r6, r7, [r1, #8]\n\t" -#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #8]\n\t" - "str r9, [r0, #12]\n\t" -#else - "strd r8, r9, [r0, #8]\n\t" -#endif - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #8]\n\t" - "str r11, [r1, #12]\n\t" -#else - "strd r10, r11, [r1, #8]\n\t" -#endif - /* Add */ #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2, #16]\n\t" - "ldr r5, [r2, #20]\n\t" + "str r10, [sp, #48]\n\t" + "str r11, [sp, #52]\n\t" #else - "ldrd r4, r5, [r2, #16]\n\t" + "strd r10, r11, [sp, #48]\n\t" #endif #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #16]\n\t" 
- "ldr r7, [r1, #20]\n\t" + "str r10, [sp, #56]\n\t" + "str r11, [sp, #60]\n\t" #else - "ldrd r6, r7, [r1, #16]\n\t" + "strd r10, r11, [sp, #56]\n\t" #endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" + "add r3, sp, #0x40\n\t" + /* Copy */ + "ldm r2, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "stm r3, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "mov %[a], #0xfe\n\t" + "\n" + "L_curve25519_bits_%=: \n\t" + "str %[a], [sp, #168]\n\t" + "ldr %[n], [sp, #160]\n\t" + "and r4, %[a], #31\n\t" + "lsr %[a], %[a], #5\n\t" + "ldr %[a], [%[n], r2, lsl #2]\n\t" + "rsb r4, r4, #31\n\t" + "lsl %[a], %[a], r4\n\t" + "ldr %[n], [sp, #164]\n\t" + "eor %[n], %[n], %[a]\n\t" + "asr %[n], %[n], #31\n\t" + "str %[a], [sp, #164]\n\t" + /* Conditional Swap */ + "add r11, sp, #0xb0\n\t" + "ldm r11, {r4, r5, r6, r7}\n\t" + "eor r8, r4, r5\n\t" + "eor r9, r6, r7\n\t" + "and r8, r8, %[n]\n\t" + "and r9, r9, %[n]\n\t" + "eor r4, r4, r8\n\t" + "eor r5, r5, r8\n\t" + "eor r6, r6, r9\n\t" + "eor r7, r7, r9\n\t" + "stm r11, {r4, r5, r6, r7}\n\t" + /* Ladder step */ + "ldr r3, [sp, #184]\n\t" + "ldr r2, [sp, #176]\n\t" + "add r1, sp, #0x80\n\t" + "ldr r0, [sp, #176]\n\t" + "bl fe_add_sub_op\n\t" + "ldr r3, [sp, #188]\n\t" + "ldr r2, [sp, #180]\n\t" + "add r1, sp, #0x60\n\t" + "ldr r0, [sp, #184]\n\t" + "bl fe_add_sub_op\n\t" + "ldr r2, [sp, #176]\n\t" + "add r1, sp, #0x60\n\t" + "ldr r0, [sp, #188]\n\t" + "bl fe_mul_op\n\t" + "add r2, sp, #0x80\n\t" + "ldr r1, [sp, #184]\n\t" + "ldr r0, [sp, #184]\n\t" + "bl fe_mul_op\n\t" + "add r1, sp, #0x80\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_sq_op\n\t" + "ldr r1, [sp, #176]\n\t" + "add r0, sp, #0x80\n\t" + "bl fe_sq_op\n\t" + "ldr r3, [sp, #184]\n\t" + "ldr r2, [sp, #188]\n\t" + "ldr r1, [sp, #184]\n\t" + "ldr r0, [sp, #180]\n\t" + "bl fe_add_sub_op\n\t" + "add r2, sp, #0x60\n\t" + "add r1, sp, #0x80\n\t" + "ldr r0, [sp, #176]\n\t" + "bl fe_mul_op\n\t" + "add r2, sp, #0x60\n\t" + "add r1, sp, #0x80\n\t" + "add r0, sp, #0x80\n\t" + "bl fe_sub_op\n\t" + "ldr r1, [sp, #184]\n\t" + "ldr r0, [sp, #184]\n\t" + "bl fe_sq_op\n\t" + "add r1, sp, #0x80\n\t" + "ldr r0, [sp, #188]\n\t" + "bl fe_mul121666\n\t" + "ldr r1, [sp, #180]\n\t" + "ldr r0, [sp, #180]\n\t" + "bl fe_sq_op\n\t" + "ldr r2, [sp, #188]\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_add_op\n\t" + "ldr r2, [sp, #184]\n\t" + "ldr r1, [sp, #172]\n\t" + "ldr r0, [sp, #188]\n\t" + "bl fe_mul_op\n\t" + "add r2, sp, #0x60\n\t" + "add r1, sp, #0x80\n\t" + "ldr r0, [sp, #184]\n\t" + "bl fe_mul_op\n\t" + "ldr %[a], [sp, #168]\n\t" + "subs %[a], %[a], #1\n\t" + "bge L_curve25519_bits_%=\n\t" + "ldr %[n], [sp, #184]\n\t" + /* Copy */ + "ldm r1, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "stm sp, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + /* Invert */ + "add r1, sp, #0\n\t" + "add r0, sp, #32\n\t" + "bl fe_sq_op\n\t" + "add r1, sp, #32\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_sq_op\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_sq_op\n\t" + "add r2, sp, #0x40\n\t" + "add r1, sp, #0\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_mul_op\n\t" + "add r2, sp, #0x40\n\t" + "add r1, sp, #32\n\t" + "add r0, sp, #32\n\t" + "bl fe_mul_op\n\t" + "add r1, sp, #32\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_sq_op\n\t" + "add r2, sp, #0x60\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_mul_op\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_sq_op\n\t" + "mov r12, #4\n\t" + "\n" + "L_curve25519_inv_1_%=: \n\t" + "add r1, 
sp, #0x60\n\t" + "add r0, sp, #0x60\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_curve25519_inv_1_%=\n\t" + "add r2, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_mul_op\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_sq_op\n\t" + "mov r12, #9\n\t" + "\n" + "L_curve25519_inv_2_%=: \n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x60\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_curve25519_inv_2_%=\n\t" + "add r2, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_mul_op\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x80\n\t" + "bl fe_sq_op\n\t" + "mov r12, #19\n\t" + "\n" + "L_curve25519_inv_3_%=: \n\t" + "add r1, sp, #0x80\n\t" + "add r0, sp, #0x80\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_curve25519_inv_3_%=\n\t" + "add r2, sp, #0x60\n\t" + "add r1, sp, #0x80\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_mul_op\n\t" + "mov r12, #10\n\t" + "\n" + "L_curve25519_inv_4_%=: \n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x60\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_curve25519_inv_4_%=\n\t" + "add r2, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_mul_op\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_sq_op\n\t" + "mov r12, #49\n\t" + "\n" + "L_curve25519_inv_5_%=: \n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x60\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_curve25519_inv_5_%=\n\t" + "add r2, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_mul_op\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x80\n\t" + "bl fe_sq_op\n\t" + "mov r12, #0x63\n\t" + "\n" + "L_curve25519_inv_6_%=: \n\t" + "add r1, sp, #0x80\n\t" + "add r0, sp, #0x80\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_curve25519_inv_6_%=\n\t" + "add r2, sp, #0x60\n\t" + "add r1, sp, #0x80\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_mul_op\n\t" + "mov r12, #50\n\t" + "\n" + "L_curve25519_inv_7_%=: \n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x60\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_curve25519_inv_7_%=\n\t" + "add r2, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_mul_op\n\t" + "mov r12, #5\n\t" + "\n" + "L_curve25519_inv_8_%=: \n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x40\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_curve25519_inv_8_%=\n\t" + "add r2, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0\n\t" + "bl fe_mul_op\n\t" + "ldr r2, [sp, #184]\n\t" + "ldr r1, [sp, #176]\n\t" + "ldr r0, [sp, #176]\n\t" + "bl fe_mul_op\n\t" + /* Ensure result is less than modulus */ + "ldr %[r], [sp, #176]\n\t" + "ldm %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "mov %[a], #19\n\t" + "and %[a], %[a], r11, asr #31\n\t" + "adds r4, r4, %[a]\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "bfc r11, #31, #1\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + "stm %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "mov r0, #0\n\t" + "add sp, sp, #0xc0\n\t" + : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", 
"r3", "r12", "lr" + ); + return (uint32_t)(size_t)r; +} + +#endif /* WC_NO_CACHE_RESISTANT */ +#ifdef HAVE_ED25519 +void fe_invert(fe r_p, const fe a_p) +{ + register fe r asm ("r0") = r_p; + register const fe a asm ("r1") = a_p; + + __asm__ __volatile__ ( + "sub sp, sp, #0x88\n\t" + /* Invert */ + "str %[r], [sp, #128]\n\t" + "str %[a], [sp, #132]\n\t" + "ldr r1, [sp, #132]\n\t" + "mov r0, sp\n\t" + "bl fe_sq_op\n\t" + "mov r1, sp\n\t" + "add r0, sp, #32\n\t" + "bl fe_sq_op\n\t" + "add r1, sp, #32\n\t" + "add r0, sp, #32\n\t" + "bl fe_sq_op\n\t" + "add r2, sp, #32\n\t" + "ldr r1, [sp, #132]\n\t" + "add r0, sp, #32\n\t" + "bl fe_mul_op\n\t" + "add r2, sp, #32\n\t" + "mov r1, sp\n\t" + "mov r0, sp\n\t" + "bl fe_mul_op\n\t" + "mov r1, sp\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_sq_op\n\t" + "add r2, sp, #0x40\n\t" + "add r1, sp, #32\n\t" + "add r0, sp, #32\n\t" + "bl fe_mul_op\n\t" + "add r1, sp, #32\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_sq_op\n\t" + "mov r12, #4\n\t" + "\n" + "L_fe_invert1_%=: \n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x40\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_fe_invert1_%=\n\t" + "add r2, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #32\n\t" + "bl fe_mul_op\n\t" + "add r1, sp, #32\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_sq_op\n\t" + "mov r12, #9\n\t" + "\n" + "L_fe_invert2_%=: \n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x40\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_fe_invert2_%=\n\t" + "add r2, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_mul_op\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_sq_op\n\t" + "mov r12, #19\n\t" + "\n" + "L_fe_invert3_%=: \n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x60\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_fe_invert3_%=\n\t" + "add r2, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_mul_op\n\t" + "mov r12, #10\n\t" + "\n" + "L_fe_invert4_%=: \n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x40\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_fe_invert4_%=\n\t" + "add r2, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #32\n\t" + "bl fe_mul_op\n\t" + "add r1, sp, #32\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_sq_op\n\t" + "mov r12, #49\n\t" + "\n" + "L_fe_invert5_%=: \n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x40\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_fe_invert5_%=\n\t" + "add r2, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_mul_op\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x60\n\t" + "bl fe_sq_op\n\t" + "mov r12, #0x63\n\t" + "\n" + "L_fe_invert6_%=: \n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x60\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_fe_invert6_%=\n\t" + "add r2, sp, #0x40\n\t" + "add r1, sp, #0x60\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_mul_op\n\t" + "mov r12, #50\n\t" + "\n" + "L_fe_invert7_%=: \n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x40\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_fe_invert7_%=\n\t" + "add r2, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #32\n\t" + "bl fe_mul_op\n\t" + "mov r12, #5\n\t" + "\n" + "L_fe_invert8_%=: \n\t" + "add r1, sp, #32\n\t" + "add r0, sp, #32\n\t" + "push 
{r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_fe_invert8_%=\n\t" + "mov r2, sp\n\t" + "add r1, sp, #32\n\t" + "ldr r0, [sp, #128]\n\t" + "bl fe_mul_op\n\t" + "ldr %[a], [sp, #132]\n\t" + "ldr %[r], [sp, #128]\n\t" + "add sp, sp, #0x88\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "lr", "r12", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void fe_sq2(fe r_p, const fe a_p) +{ + register fe r asm ("r0") = r_p; + register const fe a asm ("r1") = a_p; + + __asm__ __volatile__ ( + "sub sp, sp, #36\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" + "str r0, [sp, #28]\n\t" + "str r1, [sp, #32]\n\t" #else - "strd r8, r9, [r0, #16]\n\t" + "strd r0, r1, [sp, #28]\n\t" #endif - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" + "ldm r1, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t" + /* Square * 2 */ + "umull r9, r10, r0, r0\n\t" + "umull r11, r12, r0, r1\n\t" + "adds r11, r11, r11\n\t" "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #16]\n\t" - "str r11, [r1, #20]\n\t" -#else - "strd r10, r11, [r1, #16]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2, #24]\n\t" - "ldr r5, [r2, #28]\n\t" -#else - "ldrd r4, r5, [r2, #24]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #24]\n\t" - "ldr r7, [r1, #28]\n\t" -#else - "ldrd r6, r7, [r1, #24]\n\t" -#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "adc r9, r5, r7\n\t" - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "sbc r11, r5, r7\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r9, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0]\n\t" - "ldr r5, [r0, #4]\n\t" -#else - "ldrd r4, r5, [r0]\n\t" -#endif - "subs r4, r4, r12\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0, #8]\n\t" - "ldr r5, [r0, #12]\n\t" -#else - "ldrd r4, r5, [r0, #8]\n\t" -#endif - "sbcs r4, r4, %[rt]\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0, #8]\n\t" - "str r5, [r0, #12]\n\t" -#else - "strd r4, r5, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0, #16]\n\t" - "ldr r5, [r0, #20]\n\t" -#else - "ldrd r4, r5, [r0, #16]\n\t" -#endif - "sbcs r4, r4, %[rt]\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0, #16]\n\t" - "str r5, [r0, #20]\n\t" -#else - "strd r4, r5, [r0, #16]\n\t" -#endif - "sbcs r8, r8, %[rt]\n\t" - "sbc r9, r9, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #24]\n\t" - "str r9, [r0, #28]\n\t" -#else - "strd r8, r9, [r0, #24]\n\t" -#endif - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - "ldrd r4, r5, [r1]\n\t" 
-#endif - "adds r4, r4, r12\n\t" - "adcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1]\n\t" - "str r5, [r1, #4]\n\t" -#else - "strd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #8]\n\t" - "ldr r5, [r1, #12]\n\t" -#else - "ldrd r4, r5, [r1, #8]\n\t" -#endif - "adcs r4, r4, %[rt]\n\t" - "adcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1, #8]\n\t" - "str r5, [r1, #12]\n\t" -#else - "strd r4, r5, [r1, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #16]\n\t" - "ldr r5, [r1, #20]\n\t" -#else - "ldrd r4, r5, [r1, #16]\n\t" -#endif - "adcs r4, r4, %[rt]\n\t" - "adcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1, #16]\n\t" - "str r5, [r1, #20]\n\t" -#else - "strd r4, r5, [r1, #16]\n\t" -#endif - "adcs r10, r10, %[rt]\n\t" - "adc r11, r11, lr\n\t" + "umaal r10, r11, lr, lr\n\t" + "stm sp, {r9, r10}\n\t" + "mov r8, lr\n\t" + "umaal r8, r12, r0, r2\n\t" + "adcs r8, r8, r8\n\t" + "umaal r8, r11, r1, r1\n\t" + "umull r9, r10, r0, r3\n\t" + "umaal r9, r12, r1, r2\n\t" + "adcs r9, r9, r9\n\t" + "umaal r9, r11, lr, lr\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #24]\n\t" - "str r11, [r1, #28]\n\t" + "str r8, [sp, #8]\n\t" + "str r9, [sp, #12]\n\t" #else - "strd r10, r11, [r1, #24]\n\t" + "strd r8, r9, [sp, #8]\n\t" #endif - "add sp, sp, #0x60\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) + "mov r9, lr\n\t" + "umaal r9, r10, r0, r4\n\t" + "umaal r9, r12, r1, r3\n\t" + "adcs r9, r9, r9\n\t" + "umaal r9, r11, r2, r2\n\t" + "str r9, [sp, #16]\n\t" + "umull r9, r8, r0, r5\n\t" + "umaal r9, r12, r1, r4\n\t" + "umaal r9, r10, r2, r3\n\t" + "adcs r9, r9, r9\n\t" + "umaal r9, r11, lr, lr\n\t" + "str r9, [sp, #20]\n\t" + "mov r9, lr\n\t" + "umaal r9, r8, r0, r6\n\t" + "umaal r9, r12, r1, r5\n\t" + "umaal r9, r10, r2, r4\n\t" + "adcs r9, r9, r9\n\t" + "umaal r9, r11, r3, r3\n\t" + "str r9, [sp, #24]\n\t" + "umull r0, r9, r0, r7\n\t" + "umaal r0, r8, r1, r6\n\t" + "umaal r0, r12, r2, r5\n\t" + "umaal r0, r10, r3, r4\n\t" + "adcs r0, r0, r0\n\t" + "umaal r0, r11, lr, lr\n\t" + /* R[7] = r0 */ + "umaal r9, r8, r1, r7\n\t" + "umaal r9, r10, r2, r6\n\t" + "umaal r12, r9, r3, r5\n\t" + "adcs r12, r12, r12\n\t" + "umaal r12, r11, r4, r4\n\t" + /* R[8] = r12 */ + "umaal r9, r8, r2, r7\n\t" + "umaal r10, r9, r3, r6\n\t" + "mov r2, lr\n\t" + "umaal r10, r2, r4, r5\n\t" + "adcs r10, r10, r10\n\t" + "umaal r11, r10, lr, lr\n\t" + /* R[9] = r11 */ + "umaal r2, r8, r3, r7\n\t" + "umaal r2, r9, r4, r6\n\t" + "adcs r3, r2, r2\n\t" + "umaal r10, r3, r5, r5\n\t" + /* R[10] = r10 */ + "mov r1, lr\n\t" + "umaal r1, r8, r4, r7\n\t" + "umaal r1, r9, r5, r6\n\t" + "adcs r4, r1, r1\n\t" + "umaal r3, r4, lr, lr\n\t" + /* R[11] = r3 */ + "umaal r8, r9, r5, r7\n\t" + "adcs r8, r8, r8\n\t" + "umaal r4, r8, r6, r6\n\t" + /* R[12] = r4 */ + "mov r5, lr\n\t" + "umaal r5, r9, r6, r7\n\t" + "adcs r5, r5, r5\n\t" + "umaal r8, r5, lr, lr\n\t" + /* R[13] = r8 */ + "adcs r9, r9, r9\n\t" + "umaal r9, r5, r7, r7\n\t" + "adcs r7, r5, lr\n\t" + /* R[14] = r9 */ + /* R[15] = r7 */ + /* Reduce */ + "mov r6, #37\n\t" + "umaal r7, r0, r7, r6\n\t" + "mov r6, #19\n\t" + "lsl r0, r0, #1\n\t" + "orr r0, r0, r7, lsr #31\n\t" + "mul lr, r0, r6\n\t" + "pop {r0-r1}\n\t" + "mov r6, #38\n\t" + "umaal r0, lr, r12, r6\n\t" + "umaal r1, lr, r11, 
r6\n\t" + "mov r12, r3\n\t" + "mov r11, r4\n\t" + "pop {r2-r4}\n\t" + "umaal r2, lr, r10, r6\n\t" + "umaal r3, lr, r12, r6\n\t" + "umaal r4, lr, r11, r6\n\t" + "mov r12, r6\n\t" + "pop {r5-r6}\n\t" + "umaal r5, lr, r8, r12\n\t" + "bfc r7, #31, #1\n\t" + "umaal r6, lr, r9, r12\n\t" + "add r7, r7, lr\n\t" + /* Reduce if top bit set */ + "mov r11, #19\n\t" + "and r12, r11, r7, ASR #31\n\t" + "adds r0, r0, r12\n\t" + "adcs r1, r1, #0\n\t" + "adcs r2, r2, #0\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "adcs r5, r5, #0\n\t" + "bfc r7, #31, #1\n\t" + "adcs r6, r6, #0\n\t" + "adc r7, r7, #0\n\t" + /* Double */ + "adds r0, r0, r0\n\t" + "adcs r1, r1, r1\n\t" + "adcs r2, r2, r2\n\t" + "adcs r3, r3, r3\n\t" + "adcs r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adc r7, r7, r7\n\t" + /* Reduce if top bit set */ + "mov r11, #19\n\t" + "and r12, r11, r7, ASR #31\n\t" + "adds r0, r0, r12\n\t" + "adcs r1, r1, #0\n\t" + "adcs r2, r2, #0\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "adcs r5, r5, #0\n\t" + "bfc r7, #31, #1\n\t" + "adcs r6, r6, #0\n\t" + "adc r7, r7, #0\n\t" + "pop {r12, lr}\n\t" + /* Store */ + "stm r12, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t" + "mov r0, r12\n\t" + "mov r1, lr\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "lr" + ); +} + +void fe_pow22523(fe r_p, const fe a_p) +{ + register fe r asm ("r0") = r_p; + register const fe a asm ("r1") = a_p; + + __asm__ __volatile__ ( + "sub sp, sp, #0x68\n\t" + /* pow22523 */ + "str %[r], [sp, #96]\n\t" + "str %[a], [sp, #100]\n\t" + "ldr r1, [sp, #100]\n\t" + "mov r0, sp\n\t" + "bl fe_sq_op\n\t" + "mov r1, sp\n\t" + "add r0, sp, #32\n\t" + "bl fe_sq_op\n\t" + "add r1, sp, #32\n\t" + "add r0, sp, #32\n\t" + "bl fe_sq_op\n\t" + "add r2, sp, #32\n\t" + "ldr r1, [sp, #100]\n\t" + "add r0, sp, #32\n\t" + "bl fe_mul_op\n\t" + "add r2, sp, #32\n\t" + "mov r1, sp\n\t" + "mov r0, sp\n\t" + "bl fe_mul_op\n\t" + "mov r1, sp\n\t" + "mov r0, sp\n\t" + "bl fe_sq_op\n\t" + "mov r2, sp\n\t" + "add r1, sp, #32\n\t" + "mov r0, sp\n\t" + "bl fe_mul_op\n\t" + "mov r1, sp\n\t" + "add r0, sp, #32\n\t" + "bl fe_sq_op\n\t" + "mov r12, #4\n\t" + "\n" + "L_fe_pow22523_1_%=: \n\t" + "add r1, sp, #32\n\t" + "add r0, sp, #32\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_fe_pow22523_1_%=\n\t" + "mov r2, sp\n\t" + "add r1, sp, #32\n\t" + "mov r0, sp\n\t" + "bl fe_mul_op\n\t" + "mov r1, sp\n\t" + "add r0, sp, #32\n\t" + "bl fe_sq_op\n\t" + "mov r12, #9\n\t" + "\n" + "L_fe_pow22523_2_%=: \n\t" + "add r1, sp, #32\n\t" + "add r0, sp, #32\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_fe_pow22523_2_%=\n\t" + "mov r2, sp\n\t" + "add r1, sp, #32\n\t" + "add r0, sp, #32\n\t" + "bl fe_mul_op\n\t" + "add r1, sp, #32\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_sq_op\n\t" + "mov r12, #19\n\t" + "\n" + "L_fe_pow22523_3_%=: \n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x40\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_fe_pow22523_3_%=\n\t" + "add r2, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #32\n\t" + "bl fe_mul_op\n\t" + "mov r12, #10\n\t" + "\n" + "L_fe_pow22523_4_%=: \n\t" + "add r1, sp, #32\n\t" + "add r0, sp, #32\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_fe_pow22523_4_%=\n\t" + "mov r2, sp\n\t" + "add r1, sp, #32\n\t" + "mov r0, sp\n\t" + "bl fe_mul_op\n\t" + "mov r1, sp\n\t" + "add r0, sp, #32\n\t" + "bl fe_sq_op\n\t" + 
"mov r12, #49\n\t" + "\n" + "L_fe_pow22523_5_%=: \n\t" + "add r1, sp, #32\n\t" + "add r0, sp, #32\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_fe_pow22523_5_%=\n\t" + "mov r2, sp\n\t" + "add r1, sp, #32\n\t" + "add r0, sp, #32\n\t" + "bl fe_mul_op\n\t" + "add r1, sp, #32\n\t" + "add r0, sp, #0x40\n\t" + "bl fe_sq_op\n\t" + "mov r12, #0x63\n\t" + "\n" + "L_fe_pow22523_6_%=: \n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #0x40\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_fe_pow22523_6_%=\n\t" + "add r2, sp, #32\n\t" + "add r1, sp, #0x40\n\t" + "add r0, sp, #32\n\t" + "bl fe_mul_op\n\t" + "mov r12, #50\n\t" + "\n" + "L_fe_pow22523_7_%=: \n\t" + "add r1, sp, #32\n\t" + "add r0, sp, #32\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_fe_pow22523_7_%=\n\t" + "mov r2, sp\n\t" + "add r1, sp, #32\n\t" + "mov r0, sp\n\t" + "bl fe_mul_op\n\t" + "mov r12, #2\n\t" + "\n" + "L_fe_pow22523_8_%=: \n\t" + "mov r1, sp\n\t" + "mov r0, sp\n\t" + "push {r12}\n\t" + "bl fe_sq_op\n\t" + "pop {r12}\n\t" + "subs r12, r12, #1\n\t" + "bne L_fe_pow22523_8_%=\n\t" + "ldr r2, [sp, #100]\n\t" + "mov r1, sp\n\t" + "ldr r0, [sp, #96]\n\t" + "bl fe_mul_op\n\t" + "ldr %[a], [sp, #100]\n\t" + "ldr %[r], [sp, #96]\n\t" + "add sp, sp, #0x68\n\t" + : [r] "+r" (r), [a] "+r" (a) : - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + : "memory", "lr", "r12", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" ); - (void)px_p; - (void)py_p; - (void)pz_p; - (void)pt_p; - (void)qz_p; - (void)qt2d_p; - (void)qyplusx_p; - (void)qyminusx_p; } -void fe_ge_sub(fe rx_p, fe ry_p, fe rz_p, fe rt_p, const fe px_p, const fe py_p, const fe pz_p, const fe pt_p, const fe qz_p, const fe qt2d_p, const fe qyplusx_p, const fe qyminusx_p) +void ge_p1p1_to_p2(ge_p2 * r_p, const ge_p1p1 * p_p) { - register fe rx asm ("r0") = rx_p; - register fe ry asm ("r1") = ry_p; - register fe rz asm ("r2") = rz_p; - register fe rt asm ("r3") = rt_p; - register const fe px asm ("r4") = px_p; - register const fe py asm ("r5") = py_p; - register const fe pz asm ("r6") = pz_p; - register const fe pt asm ("r7") = pt_p; - register const fe qz asm ("r8") = qz_p; - register const fe qt2d asm ("r9") = qt2d_p; - register const fe qyplusx asm ("r10") = qyplusx_p; - register const fe qyminusx asm ("r11") = qyminusx_p; + register ge_p2 * r asm ("r0") = r_p; + register const ge_p1p1 * p asm ("r1") = p_p; __asm__ __volatile__ ( - "sub sp, sp, #0x60\n\t" - "str %[rx], [sp]\n\t" - "str %[ry], [sp, #4]\n\t" - "str %[rz], [sp, #8]\n\t" - "str %[rt], [sp, #12]\n\t" + "sub sp, sp, #8\n\t" + "str %[r], [sp]\n\t" + "str %[p], [sp, #4]\n\t" + "add r2, r1, #0x60\n\t" + "bl fe_mul_op\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #4]\n\t" + "add r2, r1, #0x40\n\t" + "add r1, r1, #32\n\t" + "add r0, r0, #32\n\t" + "bl fe_mul_op\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #4]\n\t" + "add r2, r1, #0x60\n\t" + "add r1, r1, #0x40\n\t" + "add r0, r0, #0x40\n\t" + "bl fe_mul_op\n\t" + "add sp, sp, #8\n\t" + : [r] "+r" (r), [p] "+r" (p) + : + : "memory", "lr", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void ge_p1p1_to_p3(ge_p3 * r_p, const ge_p1p1 * p_p) +{ + register ge_p3 * r asm ("r0") = r_p; + register const ge_p1p1 * p asm ("r1") = p_p; + + __asm__ __volatile__ ( + "sub sp, sp, #8\n\t" + "str %[r], [sp]\n\t" + "str %[p], [sp, #4]\n\t" + "add r2, r1, #0x60\n\t" + "bl fe_mul_op\n\t" 
"ldr r0, [sp]\n\t" - "ldr r1, [sp, #136]\n\t" - "ldr r2, [sp, #132]\n\t" - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - "ldrd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #8]\n\t" - "ldr r7, [r1, #12]\n\t" -#else - "ldrd r6, r7, [r1, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r2]\n\t" - "ldr r9, [r2, #4]\n\t" -#else - "ldrd r8, r9, [r2]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r2, #8]\n\t" - "ldr r11, [r2, #12]\n\t" -#else - "ldrd r10, r11, [r2, #8]\n\t" -#endif - "adds r8, r4, r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adcs r11, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0]\n\t" - "str r9, [r0, #4]\n\t" -#else - "strd r8, r9, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #8]\n\t" - "str r11, [r0, #12]\n\t" -#else - "strd r10, r11, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #16]\n\t" - "ldr r5, [r1, #20]\n\t" -#else - "ldrd r4, r5, [r1, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #24]\n\t" - "ldr r7, [r1, #28]\n\t" -#else - "ldrd r6, r7, [r1, #24]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r2, #16]\n\t" - "ldr r9, [r2, #20]\n\t" -#else - "ldrd r8, r9, [r2, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r2, #24]\n\t" - "ldr r11, [r2, #28]\n\t" -#else - "ldrd r10, r11, [r2, #24]\n\t" -#endif - "adcs r8, r4, r8\n\t" - "adcs r9, r5, r9\n\t" - "adcs r10, r6, r10\n\t" - "adc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0]\n\t" - "ldr r5, [r0, #4]\n\t" -#else - "ldrd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #8]\n\t" - "ldr r7, [r0, #12]\n\t" -#else - "ldrd r6, r7, [r0, #8]\n\t" -#endif - "subs r4, r4, r12\n\t" - "sbcs r5, r5, %[rt]\n\t" - "sbcs r6, r6, %[rt]\n\t" - "sbcs r7, r7, %[rt]\n\t" - "sbcs r8, r8, %[rt]\n\t" - "sbcs r9, r9, %[rt]\n\t" - "sbcs r10, r10, %[rt]\n\t" - "sbc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [r0, #8]\n\t" - "str r7, [r0, #12]\n\t" -#else - "strd r6, r7, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #24]\n\t" - "str r11, [r0, #28]\n\t" -#else - "strd r10, r11, [r0, #24]\n\t" -#endif - "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp, #136]\n\t" - "ldr r2, [sp, #132]\n\t" - /* Sub */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - "ldrd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #8]\n\t" - "ldr r7, [r1, #12]\n\t" 
-#else - "ldrd r6, r7, [r1, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r2]\n\t" - "ldr r9, [r2, #4]\n\t" -#else - "ldrd r8, r9, [r2]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r2, #8]\n\t" - "ldr r11, [r2, #12]\n\t" -#else - "ldrd r10, r11, [r2, #8]\n\t" -#endif - "subs r8, r4, r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbcs r11, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0]\n\t" - "str r9, [r0, #4]\n\t" -#else - "strd r8, r9, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #8]\n\t" - "str r11, [r0, #12]\n\t" -#else - "strd r10, r11, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #16]\n\t" - "ldr r5, [r1, #20]\n\t" -#else - "ldrd r4, r5, [r1, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #24]\n\t" - "ldr r7, [r1, #28]\n\t" -#else - "ldrd r6, r7, [r1, #24]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r2, #16]\n\t" - "ldr r9, [r2, #20]\n\t" -#else - "ldrd r8, r9, [r2, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r2, #24]\n\t" - "ldr r11, [r2, #28]\n\t" -#else - "ldrd r10, r11, [r2, #24]\n\t" -#endif - "sbcs r8, r4, r8\n\t" - "sbcs r9, r5, r9\n\t" - "sbcs r10, r6, r10\n\t" - "sbc r11, r7, r11\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0]\n\t" - "ldr r5, [r0, #4]\n\t" -#else - "ldrd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #8]\n\t" - "ldr r7, [r0, #12]\n\t" -#else - "ldrd r6, r7, [r0, #8]\n\t" -#endif - "adds r4, r4, r12\n\t" - "adcs r5, r5, %[rt]\n\t" - "adcs r6, r6, %[rt]\n\t" - "adcs r7, r7, %[rt]\n\t" - "adcs r8, r8, %[rt]\n\t" - "adcs r9, r9, %[rt]\n\t" - "adcs r10, r10, %[rt]\n\t" - "adc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [r0, #8]\n\t" - "str r7, [r0, #12]\n\t" -#else - "strd r6, r7, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #24]\n\t" - "str r11, [r0, #28]\n\t" -#else - "strd r10, r11, [r0, #24]\n\t" -#endif - "ldr r2, [sp, #160]\n\t" - "ldr r1, [sp]\n\t" - "ldr r0, [sp, #8]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #156]\n\t" "ldr r1, [sp, #4]\n\t" - "ldr r0, [sp, #4]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #144]\n\t" - "ldr r1, [sp, #152]\n\t" - "ldr r0, [sp, #12]\n\t" - "bl fe_mul\n\t" - "ldr r2, [sp, #148]\n\t" - "ldr r1, [sp, #140]\n\t" + "add r2, r1, #0x40\n\t" + "add r1, r1, #32\n\t" + "add r0, r0, #32\n\t" + "bl fe_mul_op\n\t" "ldr r0, [sp]\n\t" - "bl fe_mul\n\t" - "add r0, sp, #16\n\t" - "ldr r1, [sp]\n\t" + "ldr r1, [sp, #4]\n\t" + "add r2, r1, #0x60\n\t" + "add r1, r1, #0x40\n\t" + "add r0, r0, #0x40\n\t" + "bl fe_mul_op\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #4]\n\t" + "add r2, r1, 
#32\n\t" + "add r0, r0, #0x60\n\t" + "bl fe_mul_op\n\t" + "add sp, sp, #8\n\t" + : [r] "+r" (r), [p] "+r" (p) + : + : "memory", "lr", "r2", "r3", "r12", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void ge_p2_dbl(ge_p1p1 * r_p, const ge_p2 * p_p) +{ + register ge_p1p1 * r asm ("r0") = r_p; + register const ge_p2 * p asm ("r1") = p_p; + + __asm__ __volatile__ ( + "sub sp, sp, #8\n\t" + "str %[r], [sp]\n\t" + "str %[p], [sp, #4]\n\t" + "bl fe_sq_op\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #4]\n\t" + "add r1, r1, #32\n\t" + "add r0, r0, #0x40\n\t" + "bl fe_sq_op\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #4]\n\t" + "add r2, r1, #32\n\t" + "add r0, r0, #32\n\t" + "bl fe_add_op\n\t" + "mov r1, r0\n\t" + "add r0, r0, #0x40\n\t" + "bl fe_sq_op\n\t" + "ldr r0, [sp]\n\t" + "mov r3, r0\n\t" + "add r2, r0, #0x40\n\t" + "add r1, r0, #0x40\n\t" + "add r0, r0, #32\n\t" + "bl fe_add_sub_op\n\t" + "mov r2, r0\n\t" + "add r1, r0, #0x40\n\t" + "sub r0, r0, #32\n\t" + "bl fe_sub_op\n\t" + "ldr r1, [sp, #4]\n\t" + "add r1, r1, #0x40\n\t" + "add r0, r0, #0x60\n\t" + "bl fe_sq2\n\t" + "sub r2, r0, #32\n\t" + "mov r1, r0\n\t" + "bl fe_sub_op\n\t" + "add sp, sp, #8\n\t" + : [r] "+r" (r), [p] "+r" (p) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +void ge_madd(ge_p1p1 * r_p, const ge_p3 * p_p, const ge_precomp * q_p) +{ + register ge_p1p1 * r asm ("r0") = r_p; + register const ge_p3 * p asm ("r1") = p_p; + register const ge_precomp * q asm ("r2") = q_p; + + __asm__ __volatile__ ( + "sub sp, sp, #12\n\t" + "str %[r], [sp]\n\t" + "str %[p], [sp, #4]\n\t" + "str %[q], [sp, #8]\n\t" + "mov r2, r1\n\t" + "add r1, r1, #32\n\t" + "bl fe_add_op\n\t" + "ldr r1, [sp, #4]\n\t" + "mov r2, r1\n\t" + "add r1, r1, #32\n\t" + "add r0, r0, #32\n\t" + "bl fe_sub_op\n\t" + "ldr r2, [sp, #8]\n\t" + "sub r1, r0, #32\n\t" + "add r0, r0, #32\n\t" + "bl fe_mul_op\n\t" + "ldr r0, [sp]\n\t" + "ldr r2, [sp, #8]\n\t" + "add r2, r2, #32\n\t" + "add r1, r0, #32\n\t" + "add r0, r0, #32\n\t" + "bl fe_mul_op\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #8]\n\t" + "ldr r2, [sp, #4]\n\t" + "add r2, r2, #0x60\n\t" + "add r1, r1, #0x40\n\t" + "add r0, r0, #0x60\n\t" + "bl fe_mul_op\n\t" + "ldr r0, [sp]\n\t" + "add r3, r0, #32\n\t" + "add r2, r0, #0x40\n\t" + "mov r1, r0\n\t" + "add r0, r0, #32\n\t" + "bl fe_add_sub_op\n\t" + "ldr r1, [sp, #4]\n\t" + "add r1, r1, #0x40\n\t" + "add r0, r0, #32\n\t" /* Double */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" -#else - "ldrd r4, r5, [r1]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r1, #8]\n\t" - "ldr r7, [r1, #12]\n\t" -#else - "ldrd r6, r7, [r1, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r8, [r1, #16]\n\t" - "ldr r9, [r1, #20]\n\t" -#else - "ldrd r8, r9, [r1, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r10, [r1, #24]\n\t" - "ldr r11, [r1, #28]\n\t" -#else - "ldrd r10, r11, [r1, #24]\n\t" -#endif + "ldm r1, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" "adds r4, r4, r4\n\t" "adcs r5, r5, r5\n\t" "adcs r6, r6, r6\n\t" @@ -9672,545 +4472,1238 @@ void fe_ge_sub(fe rx_p, fe ry_p, fe rz_p, fe rt_p, const fe px_p, const fe py_p, "adcs r8, r8, r8\n\t" "adcs r9, r9, r9\n\t" "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - 
/* Sub modulus (if overflow) */ - "subs r4, r4, r12\n\t" - "sbcs r5, r5, %[rt]\n\t" - "sbcs r6, r6, %[rt]\n\t" - "sbcs r7, r7, %[rt]\n\t" - "sbcs r8, r8, %[rt]\n\t" - "sbcs r9, r9, %[rt]\n\t" - "sbcs r10, r10, %[rt]\n\t" - "sbc r11, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r6, [r0, #8]\n\t" - "str r7, [r0, #12]\n\t" -#else - "strd r6, r7, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r0, #24]\n\t" - "str r11, [r0, #28]\n\t" -#else - "strd r10, r11, [r0, #24]\n\t" -#endif - "ldr r0, [sp, #4]\n\t" - "ldr r1, [sp]\n\t" + "mov lr, #0\n\t" + "adcs r11, r11, r11\n\t" + "adc lr, lr, #0\n\t" + "mov r12, #19\n\t" + "lsl lr, lr, #1\n\t" + "orr lr, lr, r11, lsr #31\n\t" + "mul r12, lr, r12\n\t" + "adds r4, r4, r12\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "bfc r11, #31, #1\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + "stm r0, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + /* Done Double */ + "add r3, r0, #32\n\t" + "add r1, r0, #32\n\t" + "bl fe_add_sub_op\n\t" + "add sp, sp, #12\n\t" + : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +void ge_msub(ge_p1p1 * r_p, const ge_p3 * p_p, const ge_precomp * q_p) +{ + register ge_p1p1 * r asm ("r0") = r_p; + register const ge_p3 * p asm ("r1") = p_p; + register const ge_precomp * q asm ("r2") = q_p; + + __asm__ __volatile__ ( + "sub sp, sp, #12\n\t" + "str %[r], [sp]\n\t" + "str %[p], [sp, #4]\n\t" + "str %[q], [sp, #8]\n\t" + "mov r2, r1\n\t" + "add r1, r1, #32\n\t" + "bl fe_add_op\n\t" + "ldr r1, [sp, #4]\n\t" + "mov r2, r1\n\t" + "add r1, r1, #32\n\t" + "add r0, r0, #32\n\t" + "bl fe_sub_op\n\t" "ldr r2, [sp, #8]\n\t" - /* Add-Sub */ - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2]\n\t" - "ldr r5, [r2, #4]\n\t" -#else - "ldrd r4, r5, [r2]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0]\n\t" - "ldr r7, [r0, #4]\n\t" -#else - "ldrd r6, r7, [r0]\n\t" -#endif - "adds r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0]\n\t" - "str r9, [r0, #4]\n\t" -#else - "strd r8, r9, [r0]\n\t" -#endif - /* Sub */ - "subs r10, r4, r6\n\t" + "add r2, r2, #32\n\t" + "sub r1, r0, #32\n\t" + "add r0, r0, #32\n\t" + "bl fe_mul_op\n\t" + "ldr r0, [sp]\n\t" + "ldr r2, [sp, #8]\n\t" + "add r1, r0, #32\n\t" + "add r0, r0, #32\n\t" + "bl fe_mul_op\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #8]\n\t" + "ldr r2, [sp, #4]\n\t" + "add r2, r2, #0x60\n\t" + "add r1, r1, #0x40\n\t" + "add r0, r0, #0x60\n\t" + "bl fe_mul_op\n\t" + "ldr r0, [sp]\n\t" + "add r3, r0, #32\n\t" + "add r2, r0, #0x40\n\t" + "mov r1, r0\n\t" + "add r0, r0, #32\n\t" + "bl fe_add_sub_op\n\t" + "ldr r1, [sp, #4]\n\t" + "add r1, r1, #0x40\n\t" + "add r0, r0, #32\n\t" + /* Double */ + "ldm r1, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "adds r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + 
"adcs r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" + "adcs r11, r11, r11\n\t" "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1]\n\t" - "str r11, [r1, #4]\n\t" -#else - "strd r10, r11, [r1]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2, #8]\n\t" - "ldr r5, [r2, #12]\n\t" -#else - "ldrd r4, r5, [r2, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #8]\n\t" - "ldr r7, [r0, #12]\n\t" -#else - "ldrd r6, r7, [r0, #8]\n\t" -#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #8]\n\t" - "str r9, [r0, #12]\n\t" -#else - "strd r8, r9, [r0, #8]\n\t" -#endif - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" + "mov r12, #19\n\t" + "lsl lr, lr, #1\n\t" + "orr lr, lr, r11, lsr #31\n\t" + "mul r12, lr, r12\n\t" + "adds r4, r4, r12\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "bfc r11, #31, #1\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + "stm r0, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + /* Done Double */ + "add r3, r0, #32\n\t" + "mov r1, r0\n\t" + "add r0, r0, #32\n\t" + "bl fe_add_sub_op\n\t" + "add sp, sp, #12\n\t" + : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +void ge_add(ge_p1p1 * r_p, const ge_p3 * p_p, const ge_cached* q_p) +{ + register ge_p1p1 * r asm ("r0") = r_p; + register const ge_p3 * p asm ("r1") = p_p; + register const ge_cached* q asm ("r2") = q_p; + + __asm__ __volatile__ ( + "sub sp, sp, #44\n\t" + "str %[r], [sp]\n\t" + "str %[p], [sp, #4]\n\t" + "str %[q], [sp, #8]\n\t" + "mov r3, r1\n\t" + "add r2, r1, #32\n\t" + "add r1, r0, #32\n\t" + "bl fe_add_sub_op\n\t" + "ldr r2, [sp, #8]\n\t" + "mov r1, r0\n\t" + "add r0, r0, #0x40\n\t" + "bl fe_mul_op\n\t" + "ldr r0, [sp]\n\t" + "ldr r2, [sp, #8]\n\t" + "add r2, r2, #32\n\t" + "add r1, r0, #32\n\t" + "add r0, r0, #32\n\t" + "bl fe_mul_op\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #8]\n\t" + "ldr r2, [sp, #4]\n\t" + "add r2, r2, #0x60\n\t" + "add r1, r1, #0x60\n\t" + "add r0, r0, #0x60\n\t" + "bl fe_mul_op\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #4]\n\t" + "ldr r2, [sp, #8]\n\t" + "add r2, r2, #0x40\n\t" + "add r1, r1, #0x40\n\t" + "bl fe_mul_op\n\t" + "ldr r1, [sp]\n\t" + "add r0, sp, #12\n\t" + /* Double */ + "ldm r1, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "adds r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" + "adcs r11, r11, r11\n\t" "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #8]\n\t" - "str r11, [r1, #12]\n\t" -#else - "strd r10, r11, [r1, #8]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2, #16]\n\t" - "ldr r5, [r2, #20]\n\t" -#else - "ldrd r4, r5, [r2, #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #16]\n\t" - "ldr r7, [r0, #20]\n\t" -#else - "ldrd r6, r7, [r0, #16]\n\t" -#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - 
"adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" + "mov r12, #19\n\t" + "lsl lr, lr, #1\n\t" + "orr lr, lr, r11, lsr #31\n\t" + "mul r12, lr, r12\n\t" + "adds r4, r4, r12\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "bfc r11, #31, #1\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + "stm r0, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + /* Done Double */ + "add r3, r1, #32\n\t" + "add r2, r1, #0x40\n\t" + "add r0, r1, #32\n\t" + "bl fe_add_sub_op\n\t" + "add r3, r0, #0x40\n\t" + "add r2, sp, #12\n\t" + "add r1, r0, #0x40\n\t" + "add r0, r0, #32\n\t" + "bl fe_add_sub_op\n\t" + "add sp, sp, #44\n\t" + : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +void ge_sub(ge_p1p1 * r_p, const ge_p3 * p_p, const ge_cached* q_p) +{ + register ge_p1p1 * r asm ("r0") = r_p; + register const ge_p3 * p asm ("r1") = p_p; + register const ge_cached* q asm ("r2") = q_p; + + __asm__ __volatile__ ( + "sub sp, sp, #44\n\t" + "str %[r], [sp]\n\t" + "str %[p], [sp, #4]\n\t" + "str %[q], [sp, #8]\n\t" + "mov r3, r1\n\t" + "add r2, r1, #32\n\t" + "add r1, r0, #32\n\t" + "bl fe_add_sub_op\n\t" + "ldr r2, [sp, #8]\n\t" + "add r2, r2, #32\n\t" + "mov r1, r0\n\t" + "add r0, r0, #0x40\n\t" + "bl fe_mul_op\n\t" + "ldr r0, [sp]\n\t" + "ldr r2, [sp, #8]\n\t" + "add r1, r0, #32\n\t" + "add r0, r0, #32\n\t" + "bl fe_mul_op\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #8]\n\t" + "ldr r2, [sp, #4]\n\t" + "add r2, r2, #0x60\n\t" + "add r1, r1, #0x60\n\t" + "add r0, r0, #0x60\n\t" + "bl fe_mul_op\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #4]\n\t" + "ldr r2, [sp, #8]\n\t" + "add r2, r2, #0x40\n\t" + "add r1, r1, #0x40\n\t" + "bl fe_mul_op\n\t" + "ldr r1, [sp]\n\t" + "add r0, sp, #12\n\t" + /* Double */ + "ldm r1, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "adds r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" + "adcs r11, r11, r11\n\t" "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #16]\n\t" - "str r11, [r1, #20]\n\t" -#else - "strd r10, r11, [r1, #16]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2, #24]\n\t" - "ldr r5, [r2, #28]\n\t" -#else - "ldrd r4, r5, [r2, #24]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #24]\n\t" - "ldr r7, [r0, #28]\n\t" -#else - "ldrd r6, r7, [r0, #24]\n\t" -#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "adc r9, r5, r7\n\t" - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "sbc r11, r5, r7\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r9, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0]\n\t" - "ldr r5, [r0, #4]\n\t" -#else - "ldrd r4, r5, [r0]\n\t" -#endif - "subs r4, r4, r12\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" -#else - "strd r4, r5, [r0]\n\t" -#endif 
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0, #8]\n\t" - "ldr r5, [r0, #12]\n\t" -#else - "ldrd r4, r5, [r0, #8]\n\t" -#endif - "sbcs r4, r4, %[rt]\n\t" - "sbcs r5, r5, %[rt]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0, #8]\n\t" - "str r5, [r0, #12]\n\t" -#else - "strd r4, r5, [r0, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0, #16]\n\t" - "ldr r5, [r0, #20]\n\t" + "mov r12, #19\n\t" + "lsl lr, lr, #1\n\t" + "orr lr, lr, r11, lsr #31\n\t" + "mul r12, lr, r12\n\t" + "adds r4, r4, r12\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "bfc r11, #31, #1\n\t" + "adcs r10, r10, #0\n\t" + "adc r11, r11, #0\n\t" + "stm r0, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + /* Done Double */ + "add r3, r1, #32\n\t" + "add r2, r1, #0x40\n\t" + "add r0, r1, #32\n\t" + "bl fe_add_sub_op\n\t" + "add r3, r0, #0x40\n\t" + "add r2, sp, #12\n\t" + "add r1, r0, #32\n\t" + "add r0, r0, #0x40\n\t" + "bl fe_add_sub_op\n\t" + "add sp, sp, #44\n\t" + : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +void sc_reduce(byte* s_p) +{ + register byte* s asm ("r0") = s_p; + + __asm__ __volatile__ ( + "sub sp, sp, #52\n\t" + /* Load bits 252-511 */ + "add %[s], %[s], #28\n\t" + "ldm %[s], {r1, r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "lsr lr, r9, #24\n\t" + "lsl r9, r9, #4\n\t" + "orr r9, r9, r8, lsr #28\n\t" + "lsl r8, r8, #4\n\t" + "orr r8, r8, r7, lsr #28\n\t" + "lsl r7, r7, #4\n\t" + "orr r7, r7, r6, lsr #28\n\t" + "lsl r6, r6, #4\n\t" + "orr r6, r6, r5, lsr #28\n\t" + "lsl r5, r5, #4\n\t" + "orr r5, r5, r4, lsr #28\n\t" + "lsl r4, r4, #4\n\t" + "orr r4, r4, r3, lsr #28\n\t" + "lsl r3, r3, #4\n\t" + "orr r3, r3, r2, lsr #28\n\t" + "lsl r2, r2, #4\n\t" + "orr r2, r2, r1, lsr #28\n\t" + "bfc r9, #28, #4\n\t" + "sub %[s], %[s], #28\n\t" + /* Add order times bits 504..511 */ +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "mov r10, #0x2c\n\t" + "lsl r10, r10, #8\n\t" + "add r10, r10, #0x13\n\t" +#else + "mov r10, #0x2c13\n\t" +#endif + "movt r10, #0xa30a\n\t" +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "mov r11, #0x9c\n\t" + "lsl r11, r11, #8\n\t" + "add r11, r11, #0xe5\n\t" #else - "ldrd r4, r5, [r0, #16]\n\t" + "mov r11, #0x9ce5\n\t" #endif - "sbcs r4, r4, %[rt]\n\t" - "sbcs r5, r5, %[rt]\n\t" + "movt r11, #0xa7ed\n\t" + "mov r1, #0\n\t" + "umlal r2, r1, r10, lr\n\t" + "umaal r3, r1, r11, lr\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0, #16]\n\t" - "str r5, [r0, #20]\n\t" + "mov r10, #0x63\n\t" + "lsl r10, r10, #8\n\t" + "add r10, r10, #0x29\n\t" #else - "strd r4, r5, [r0, #16]\n\t" + "mov r10, #0x6329\n\t" #endif - "sbcs r8, r8, %[rt]\n\t" - "sbc r9, r9, lr\n\t" + "movt r10, #0x5d08\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #24]\n\t" - "str r9, [r0, #28]\n\t" + "mov r11, #0x6\n\t" + "lsl r11, r11, #8\n\t" + "add r11, r11, #0x21\n\t" #else - "strd r8, r9, [r0, #24]\n\t" + "mov r11, #0x621\n\t" #endif - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ + "movt r11, #0xeb21\n\t" + "umaal r4, r1, r10, lr\n\t" + "umaal r5, r1, r11, lr\n\t" + "adds r6, r6, r1\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adc 
r9, r9, #0\n\t" + "subs r6, r6, lr\n\t" + "sbcs r7, r7, #0\n\t" + "sbcs r8, r8, #0\n\t" + "sbc r9, r9, #0\n\t" + /* Sub product of top 8 words and order */ #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" + "mov r1, #0x2c\n\t" + "lsl r1, r1, #8\n\t" + "add r1, r1, #0x13\n\t" #else - "ldrd r4, r5, [r1]\n\t" + "mov r1, #0x2c13\n\t" #endif - "adds r4, r4, r12\n\t" - "adcs r5, r5, %[rt]\n\t" + "movt r1, #0xa30a\n\t" + "mov lr, #0\n\t" + "ldm %[s]!, {r10, r11, r12}\n\t" + "umlal r10, lr, r2, r1\n\t" + "umaal r11, lr, r3, r1\n\t" + "umaal r12, lr, r4, r1\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm %[s]!, {r10, r11, r12}\n\t" + "umaal r10, lr, r5, r1\n\t" + "umaal r11, lr, r6, r1\n\t" + "umaal r12, lr, r7, r1\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm %[s]!, {r10, r11}\n\t" + "umaal r10, lr, r8, r1\n\t" + "bfc r11, #28, #4\n\t" + "umaal r11, lr, r9, r1\n\t" + "stm sp!, {r10, r11, lr}\n\t" + "sub %[s], %[s], #16\n\t" + "sub sp, sp, #32\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1]\n\t" - "str r5, [r1, #4]\n\t" + "mov r1, #0x9c\n\t" + "lsl r1, r1, #8\n\t" + "add r1, r1, #0xe5\n\t" #else - "strd r4, r5, [r1]\n\t" + "mov r1, #0x9ce5\n\t" #endif + "movt r1, #0xa7ed\n\t" + "mov lr, #0\n\t" + "ldm sp, {r10, r11, r12}\n\t" + "umlal r10, lr, r2, r1\n\t" + "umaal r11, lr, r3, r1\n\t" + "umaal r12, lr, r4, r1\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm sp, {r10, r11, r12}\n\t" + "umaal r10, lr, r5, r1\n\t" + "umaal r11, lr, r6, r1\n\t" + "umaal r12, lr, r7, r1\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm sp, {r10, r11}\n\t" + "umaal r10, lr, r8, r1\n\t" + "umaal r11, lr, r9, r1\n\t" + "stm sp!, {r10, r11, lr}\n\t" + "sub sp, sp, #32\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #8]\n\t" - "ldr r5, [r1, #12]\n\t" + "mov r1, #0x63\n\t" + "lsl r1, r1, #8\n\t" + "add r1, r1, #0x29\n\t" #else - "ldrd r4, r5, [r1, #8]\n\t" + "mov r1, #0x6329\n\t" #endif - "adcs r4, r4, %[rt]\n\t" - "adcs r5, r5, %[rt]\n\t" + "movt r1, #0x5d08\n\t" + "mov lr, #0\n\t" + "ldm sp, {r10, r11, r12}\n\t" + "umlal r10, lr, r2, r1\n\t" + "umaal r11, lr, r3, r1\n\t" + "umaal r12, lr, r4, r1\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm sp, {r10, r11, r12}\n\t" + "umaal r10, lr, r5, r1\n\t" + "umaal r11, lr, r6, r1\n\t" + "umaal r12, lr, r7, r1\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm sp, {r10, r11}\n\t" + "umaal r10, lr, r8, r1\n\t" + "umaal r11, lr, r9, r1\n\t" + "stm sp!, {r10, r11, lr}\n\t" + "sub sp, sp, #32\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1, #8]\n\t" - "str r5, [r1, #12]\n\t" + "mov r1, #0x6\n\t" + "lsl r1, r1, #8\n\t" + "add r1, r1, #0x21\n\t" #else - "strd r4, r5, [r1, #8]\n\t" + "mov r1, #0x621\n\t" #endif + "movt r1, #0xeb21\n\t" + "mov lr, #0\n\t" + "ldm sp, {r10, r11, r12}\n\t" + "umlal r10, lr, r2, r1\n\t" + "umaal r11, lr, r3, r1\n\t" + "umaal r12, lr, r4, r1\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm sp, {r10, r11, r12}\n\t" + "umaal r10, lr, r5, r1\n\t" + "umaal r11, lr, r6, r1\n\t" + "umaal r12, lr, r7, r1\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm sp, {r10, r11}\n\t" + "umaal r10, lr, r8, r1\n\t" + "umaal r11, lr, r9, r1\n\t" + "stm sp!, {r10, r11, lr}\n\t" + "sub sp, sp, #32\n\t" + /* Subtract at 4 * 32 */ + "ldm sp, {r10, r11, r12}\n\t" + "subs r10, r10, r2\n\t" + "sbcs r11, r11, r3\n\t" + "sbcs r12, r12, r4\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm sp, {r10, r11, r12}\n\t" + "sbcs r10, r10, r5\n\t" + "sbcs r11, r11, r6\n\t" + 
"sbcs r12, r12, r7\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm sp, {r10, r11}\n\t" + "sbcs r10, r10, r8\n\t" + "sbc r11, r11, r9\n\t" + "stm sp!, {r10, r11}\n\t" + "sub sp, sp, #36\n\t" + "asr lr, r11, #25\n\t" + /* Conditionally subtract order starting at bit 125 */ #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #16]\n\t" - "ldr r5, [r1, #20]\n\t" + "mov r1, #0xa00000\n\t" + "lsl r1, r1, #8\n\t" + "add r1, r1, #0x0\n\t" #else - "ldrd r4, r5, [r1, #16]\n\t" + "mov r1, #0xa0000000\n\t" #endif - "adcs r4, r4, %[rt]\n\t" - "adcs r5, r5, %[rt]\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1, #16]\n\t" - "str r5, [r1, #20]\n\t" + "mov r2, #0xba\n\t" + "lsl r2, r2, #8\n\t" + "add r2, r2, #0x7d\n\t" #else - "strd r4, r5, [r1, #16]\n\t" + "mov r2, #0xba7d\n\t" #endif - "adcs r10, r10, %[rt]\n\t" - "adc r11, r11, lr\n\t" + "movt r2, #0x4b9e\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #24]\n\t" - "str r11, [r1, #28]\n\t" + "mov r3, #0x4c\n\t" + "lsl r3, r3, #8\n\t" + "add r3, r3, #0x63\n\t" #else - "strd r10, r11, [r1, #24]\n\t" + "mov r3, #0x4c63\n\t" #endif - "ldr r0, [sp, #12]\n\t" - "ldr r1, [sp, #8]\n\t" - "add r2, sp, #16\n\t" - /* Add-Sub */ - /* Add */ + "movt r3, #0xcb02\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2]\n\t" - "ldr r5, [r2, #4]\n\t" + "mov r4, #0xf3\n\t" + "lsl r4, r4, #8\n\t" + "add r4, r4, #0x9a\n\t" #else - "ldrd r4, r5, [r2]\n\t" + "mov r4, #0xf39a\n\t" #endif + "movt r4, #0xd45e\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0]\n\t" - "ldr r7, [r0, #4]\n\t" + "mov r5, #0xdf\n\t" + "lsl r5, r5, #8\n\t" + "add r5, r5, #0x3b\n\t" #else - "ldrd r6, r7, [r0]\n\t" + "mov r5, #0xdf3b\n\t" #endif - "adds r8, r4, r6\n\t" - "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" + "movt r5, #0x29b\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0]\n\t" - "str r9, [r0, #4]\n\t" + "mov r9, #0x20000\n\t" + "lsl r9, r9, #8\n\t" + "add r9, r9, #0x0\n\t" #else - "strd r8, r9, [r0]\n\t" + "mov r9, #0x2000000\n\t" #endif - /* Sub */ - "subs r10, r4, r6\n\t" + "and r1, r1, lr\n\t" + "and r2, r2, lr\n\t" + "and r3, r3, lr\n\t" + "and r4, r4, lr\n\t" + "and r5, r5, lr\n\t" + "and r9, r9, lr\n\t" + "ldm sp, {r10, r11, r12}\n\t" + "adds r10, r10, r1\n\t" + "adcs r11, r11, r2\n\t" + "adcs r12, r12, r3\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm sp, {r10, r11, r12}\n\t" + "adcs r10, r10, r4\n\t" + "adcs r11, r11, r5\n\t" + "adcs r12, r12, #0\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm sp, {r10, r11, r12}\n\t" + "adcs r10, r10, #0\n\t" + "adcs r11, r11, #0\n\t" + "adcs r12, r12, r9\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "sub sp, sp, #48\n\t" + "sub %[s], %[s], #16\n\t" + /* Load bits 252-376 */ + "add sp, sp, #28\n\t" + "ldm sp, {r1, r2, r3, r4, r5}\n\t" + "lsl r5, r5, #4\n\t" + "orr r5, r5, r4, lsr #28\n\t" + "lsl r4, r4, #4\n\t" + "orr r4, r4, r3, lsr #28\n\t" + "lsl r3, r3, #4\n\t" + "orr r3, r3, r2, lsr #28\n\t" + "lsl r2, r2, #4\n\t" + "orr r2, r2, r1, lsr #28\n\t" + "bfc r5, #29, #3\n\t" + "sub sp, sp, #28\n\t" + /* Sub product of top 8 words and order */ + /* * -5cf5d3ed */ +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "mov r1, #0x2c\n\t" + "lsl r1, r1, #8\n\t" + "add r1, r1, #0x13\n\t" +#else + "mov r1, #0x2c13\n\t" +#endif + "movt r1, #0xa30a\n\t" "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1]\n\t" - "str r11, [r1, #4]\n\t" -#else - "strd r10, r11, [r1]\n\t" -#endif - /* Add */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2, #8]\n\t" - "ldr r5, [r2, #12]\n\t" -#else - "ldrd r4, r5, [r2, #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #8]\n\t" - "ldr r7, [r0, #12]\n\t" -#else - "ldrd r6, r7, [r0, #8]\n\t" -#endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" + "ldm sp, {r6, r7, r8, r9}\n\t" + "umlal r6, lr, r2, r1\n\t" + "umaal r7, lr, r3, r1\n\t" + "umaal r8, lr, r4, r1\n\t" + "umaal r9, lr, r5, r1\n\t" + "stm sp, {r6, r7, r8, r9}\n\t" + "add sp, sp, #4\n\t" + /* * -5812631b */ +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "mov r1, #0x9c\n\t" + "lsl r1, r1, #8\n\t" + "add r1, r1, #0xe5\n\t" +#else + "mov r1, #0x9ce5\n\t" +#endif + "movt r1, #0xa7ed\n\t" + "mov r10, #0\n\t" + "ldm sp, {r6, r7, r8, r9}\n\t" + "umlal r6, r10, r2, r1\n\t" + "umaal r7, r10, r3, r1\n\t" + "umaal r8, r10, r4, r1\n\t" + "umaal r9, r10, r5, r1\n\t" + "stm sp, {r6, r7, r8, r9}\n\t" + "add sp, sp, #4\n\t" + /* * -a2f79cd7 */ +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "mov r1, #0x63\n\t" + "lsl r1, r1, #8\n\t" + "add r1, r1, #0x29\n\t" +#else + "mov r1, #0x6329\n\t" +#endif + "movt r1, #0x5d08\n\t" + "mov r11, #0\n\t" + "ldm sp, {r6, r7, r8, r9}\n\t" + "umlal r6, r11, r2, r1\n\t" + "umaal r7, r11, r3, r1\n\t" + "umaal r8, r11, r4, r1\n\t" + "umaal r9, r11, r5, r1\n\t" + "stm sp, {r6, r7, r8, r9}\n\t" + "add sp, sp, #4\n\t" + /* * -14def9df */ +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "mov r1, #0x6\n\t" + "lsl r1, r1, #8\n\t" + "add r1, r1, #0x21\n\t" +#else + "mov r1, #0x621\n\t" +#endif + "movt r1, #0xeb21\n\t" "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" + "ldm sp, {r6, r7, r8, r9}\n\t" + "umlal r6, r12, r2, r1\n\t" + "umaal r7, r12, r3, r1\n\t" + "umaal r8, r12, r4, r1\n\t" + "umaal r9, r12, r5, r1\n\t" + "stm sp, {r6, r7, r8, r9}\n\t" + "add sp, sp, #4\n\t" + /* Add overflows at 4 * 32 */ + "ldm sp, {r6, r7, r8, r9}\n\t" + "bfc r9, #28, #4\n\t" + "adds r6, r6, lr\n\t" + "adcs r7, r7, r10\n\t" + "adcs r8, r8, r11\n\t" + "adc r9, r9, r12\n\t" + /* Subtract top at 4 * 32 */ + "subs r6, r6, r2\n\t" + "sbcs r7, r7, r3\n\t" + "sbcs r8, r8, r4\n\t" + "sbcs r9, r9, r5\n\t" + "sbc r1, r1, r1\n\t" + "sub sp, sp, #16\n\t" + "ldm sp, {r2, r3, r4, r5}\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #8]\n\t" - "str r9, [r0, #12]\n\t" + "mov r10, #0xd3\n\t" + "lsl r10, r10, #8\n\t" + "add r10, r10, #0xed\n\t" #else - "strd r8, r9, [r0, #8]\n\t" + "mov r10, #0xd3ed\n\t" #endif - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" + "movt r10, #0x5cf5\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #8]\n\t" - "str r11, [r1, #12]\n\t" + "mov r11, #0x63\n\t" + "lsl r11, r11, #8\n\t" + "add r11, r11, #0x1a\n\t" #else - "strd r10, r11, [r1, #8]\n\t" + "mov r11, #0x631a\n\t" #endif - /* Add */ + "movt r11, #0x5812\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2, #16]\n\t" - "ldr r5, [r2, #20]\n\t" + "mov r12, #0x9c\n\t" + "lsl r12, r12, #8\n\t" + "add r12, r12, #0xd6\n\t" #else - "ldrd r4, r5, [r2, #16]\n\t" + "mov r12, #0x9cd6\n\t" #endif + "movt r12, #0xa2f7\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, 
[r0, #16]\n\t" - "ldr r7, [r0, #20]\n\t" + "mov lr, #0xf9\n\t" + "lsl lr, lr, #8\n\t" + "add lr, lr, #0xde\n\t" #else - "ldrd r6, r7, [r0, #16]\n\t" + "mov lr, #0xf9de\n\t" #endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" + "movt lr, #0x14de\n\t" + "and r10, r10, r1\n\t" + "and r11, r11, r1\n\t" + "and r12, r12, r1\n\t" + "and lr, lr, r1\n\t" + "adds r2, r2, r10\n\t" + "adcs r3, r3, r11\n\t" + "adcs r4, r4, r12\n\t" + "adcs r5, r5, lr\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "and r1, r1, #0x10000000\n\t" + "adcs r8, r8, #0\n\t" + "adc r9, r9, r1\n\t" + "bfc r9, #28, #4\n\t" + /* Store result */ + "stm %[s], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "add sp, sp, #52\n\t" + : [s] "+r" (s) + : + : "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) +{ + register byte* s asm ("r0") = s_p; + register const byte* a asm ("r1") = a_p; + register const byte* b asm ("r2") = b_p; + register const byte* c asm ("r3") = c_p; + + __asm__ __volatile__ ( + "sub sp, sp, #0x50\n\t" + "add lr, sp, #0x44\n\t" + "stm lr, {%[s], %[a], %[c]}\n\t" + "mov lr, %[b]\n\t" + "ldm %[a], {%[s], %[a], %[b], %[c]}\n\t" + "ldm lr!, {r4, r5, r6}\n\t" + "umull r10, r11, %[s], r4\n\t" + "umull r12, r7, %[a], r4\n\t" + "umaal r11, r12, %[s], r5\n\t" + "umull r8, r9, %[b], r4\n\t" + "umaal r12, r8, %[a], r5\n\t" + "umaal r12, r7, %[s], r6\n\t" + "umaal r8, r9, %[c], r4\n\t" + "stm sp, {r10, r11, r12}\n\t" + "umaal r7, r8, %[b], r5\n\t" + "ldm lr!, {r4}\n\t" + "umull r10, r11, %[a], r6\n\t" + "umaal r8, r9, %[b], r6\n\t" + "umaal r7, r10, %[s], r4\n\t" + "umaal r8, r11, %[c], r5\n\t" + "str r7, [sp, #12]\n\t" + "umaal r8, r10, %[a], r4\n\t" + "umaal r9, r11, %[c], r6\n\t" + "umaal r9, r10, %[b], r4\n\t" + "umaal r10, r11, %[c], r4\n\t" + "ldm lr, {r4, r5, r6, r7}\n\t" "mov r12, #0\n\t" - "adcs r9, r5, r7\n\t" - "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #16]\n\t" - "str r9, [r0, #20]\n\t" -#else - "strd r8, r9, [r0, #16]\n\t" -#endif - /* Sub */ - "adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "mov lr, #0\n\t" - "sbcs r11, r5, r7\n\t" - "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #16]\n\t" - "str r11, [r1, #20]\n\t" + "umlal r8, r12, %[s], r4\n\t" + "umaal r9, r12, %[a], r4\n\t" + "umaal r10, r12, %[b], r4\n\t" + "umaal r11, r12, %[c], r4\n\t" + "mov r4, #0\n\t" + "umlal r9, r4, %[s], r5\n\t" + "umaal r10, r4, %[a], r5\n\t" + "umaal r11, r4, %[b], r5\n\t" + "umaal r12, r4, %[c], r5\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, %[s], r6\n\t" + "umaal r11, r5, %[a], r6\n\t" + "umaal r12, r5, %[b], r6\n\t" + "umaal r4, r5, %[c], r6\n\t" + "mov r6, #0\n\t" + "umlal r11, r6, %[s], r7\n\t" + "ldr %[s], [sp, #72]\n\t" + "umaal r12, r6, %[a], r7\n\t" + "add %[s], %[s], #16\n\t" + "umaal r4, r6, %[b], r7\n\t" + "sub lr, lr, #16\n\t" + "umaal r5, r6, %[c], r7\n\t" + "ldm %[s], {%[s], %[a], %[b], %[c]}\n\t" + "str r6, [sp, #64]\n\t" + "ldm lr!, {r6}\n\t" + "mov r7, #0\n\t" + "umlal r8, r7, %[s], r6\n\t" + "umaal r9, r7, %[a], r6\n\t" + "str r8, [sp, #16]\n\t" + "umaal r10, r7, %[b], r6\n\t" + "umaal r11, r7, %[c], r6\n\t" + "ldm lr!, {r6}\n\t" + "mov r8, #0\n\t" + "umlal r9, r8, %[s], r6\n\t" + "umaal r10, r8, %[a], r6\n\t" + "str r9, [sp, #20]\n\t" + "umaal r11, r8, %[b], r6\n\t" + "umaal r12, r8, %[c], r6\n\t" + "ldm lr!, {r6}\n\t" + "mov r9, #0\n\t" + "umlal r10, r9, %[s], r6\n\t" 
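
Both sc_reduce above and the sc_muladd product being assembled here lean almost entirely on umull/umlal/umaal chains. "umaal rdlo, rdhi, rn, rm" computes {rdhi:rdlo} = rn*rm + rdlo + rdhi, and since (2^32 - 1)^2 + 2*(2^32 - 1) = 2^64 - 1 the sum can never overflow 64 bits, so a whole column of the schoolbook multiply can be accumulated without ever touching the carry flag. A small C model of the instruction, offered as a sketch only (umaal32 is a made-up helper name, not a wolfSSL function):

    #include <stdint.h>

    /* C model of the ARM UMAAL instruction: {hi:lo} = a*b + lo + hi.
     * The sum always fits in 64 bits, so no carry can be lost. */
    static void umaal32(uint32_t *lo, uint32_t *hi, uint32_t a, uint32_t b)
    {
        uint64_t t = (uint64_t)a * b + *lo + *hi;

        *lo = (uint32_t)t;
        *hi = (uint32_t)(t >> 32);
    }

In the assembly each column is started with umull, or with "mov rX, #0" followed by umlal, and every further partial product for that column is folded in with umaal.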
+ "umaal r11, r9, %[a], r6\n\t" + "str r10, [sp, #24]\n\t" + "umaal r12, r9, %[b], r6\n\t" + "umaal r4, r9, %[c], r6\n\t" + "ldm lr!, {r6}\n\t" + "mov r10, #0\n\t" + "umlal r11, r10, %[s], r6\n\t" + "umaal r12, r10, %[a], r6\n\t" + "str r11, [sp, #28]\n\t" + "umaal r4, r10, %[b], r6\n\t" + "umaal r5, r10, %[c], r6\n\t" + "ldm lr!, {r11}\n\t" + "umaal r12, r7, %[s], r11\n\t" + "umaal r4, r7, %[a], r11\n\t" + "ldr r6, [sp, #64]\n\t" + "umaal r5, r7, %[b], r11\n\t" + "umaal r6, r7, %[c], r11\n\t" + "ldm lr!, {r11}\n\t" + "umaal r4, r8, %[s], r11\n\t" + "umaal r5, r8, %[a], r11\n\t" + "umaal r6, r8, %[b], r11\n\t" + "umaal r7, r8, %[c], r11\n\t" + "ldm lr, {r11, lr}\n\t" + "umaal r5, r9, %[s], r11\n\t" + "umaal r6, r10, %[s], lr\n\t" + "umaal r6, r9, %[a], r11\n\t" + "umaal r7, r10, %[a], lr\n\t" + "umaal r7, r9, %[b], r11\n\t" + "umaal r8, r10, %[b], lr\n\t" + "umaal r8, r9, %[c], r11\n\t" + "umaal r9, r10, %[c], lr\n\t" + "mov %[c], r12\n\t" + "add lr, sp, #32\n\t" + "stm lr, {%[c], r4, r5, r6, r7, r8, r9, r10}\n\t" + "ldr %[s], [sp, #68]\n\t" + /* Add c to a * b */ + "ldr lr, [sp, #76]\n\t" + "ldm sp!, {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" + "ldm lr!, {%[a], r10, r11, r12}\n\t" + "adds %[b], %[b], %[a]\n\t" + "adcs %[c], %[c], r10\n\t" + "adcs r4, r4, r11\n\t" + "adcs r5, r5, r12\n\t" + "ldm lr!, {%[a], r10, r11, r12}\n\t" + "adcs r6, r6, %[a]\n\t" + "adcs r7, r7, r10\n\t" + "adcs r8, r8, r11\n\t" + "adcs r9, r9, r12\n\t" + "mov %[a], r9\n\t" + "stm %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" + "ldm sp, {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" + "adcs %[b], %[b], #0\n\t" + "adcs %[c], %[c], #0\n\t" + "adcs r4, r4, #0\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adc r9, r9, #0\n\t" + "sub sp, sp, #32\n\t" + /* Get 252..503 and 504..507 */ + "lsr lr, r9, #24\n\t" + "bfc r9, #24, #8\n\t" + "lsl r9, r9, #4\n\t" + "orr r9, r9, r8, lsr #28\n\t" + "lsl r8, r8, #4\n\t" + "orr r8, r8, r7, lsr #28\n\t" + "lsl r7, r7, #4\n\t" + "orr r7, r7, r6, lsr #28\n\t" + "lsl r6, r6, #4\n\t" + "orr r6, r6, r5, lsr #28\n\t" + "lsl r5, r5, #4\n\t" + "orr r5, r5, r4, lsr #28\n\t" + "lsl r4, r4, #4\n\t" + "orr r4, r4, %[c], lsr #28\n\t" + "lsl %[c], %[c], #4\n\t" + "orr %[c], %[c], %[b], lsr #28\n\t" + "lsl %[b], %[b], #4\n\t" + "orr %[b], %[b], %[a], lsr #28\n\t" + /* Add order times bits 504..507 */ +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "mov r10, #0x2c\n\t" + "lsl r10, r10, #8\n\t" + "add r10, r10, #0x13\n\t" +#else + "mov r10, #0x2c13\n\t" +#endif + "movt r10, #0xa30a\n\t" +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "mov r11, #0x9c\n\t" + "lsl r11, r11, #8\n\t" + "add r11, r11, #0xe5\n\t" #else - "strd r10, r11, [r1, #16]\n\t" + "mov r11, #0x9ce5\n\t" #endif - /* Add */ + "movt r11, #0xa7ed\n\t" + "mov %[a], #0\n\t" + "umlal %[b], %[a], r10, lr\n\t" + "umaal %[c], %[a], r11, lr\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r2, #24]\n\t" - "ldr r5, [r2, #28]\n\t" + "mov r10, #0x63\n\t" + "lsl r10, r10, #8\n\t" + "add r10, r10, #0x29\n\t" #else - "ldrd r4, r5, [r2, #24]\n\t" + "mov r10, #0x6329\n\t" #endif + "movt r10, #0x5d08\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r6, [r0, #24]\n\t" - "ldr r7, [r0, #28]\n\t" + "mov r11, #0x6\n\t" + "lsl r11, r11, #8\n\t" + "add r11, r11, #0x21\n\t" #else - "ldrd r6, r7, [r0, #24]\n\t" + "mov r11, #0x621\n\t" #endif - "adds r12, r12, #-1\n\t" - "adcs r8, r4, r6\n\t" - "adc r9, r5, r7\n\t" - /* Sub */ - 
"adds lr, lr, #-1\n\t" - "sbcs r10, r4, r6\n\t" - "sbc r11, r5, r7\n\t" - "mov r12, #-19\n\t" - "asr %[rt], r9, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Sub modulus (if overflow) */ + "movt r11, #0xeb21\n\t" + "umaal r4, %[a], r10, lr\n\t" + "umaal r5, %[a], r11, lr\n\t" + "adds r6, r6, %[a]\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adc r9, r9, #0\n\t" + "subs r6, r6, lr\n\t" + "sbcs r7, r7, #0\n\t" + "sbcs r8, r8, #0\n\t" + "sbc r9, r9, #0\n\t" + /* Sub product of top 8 words and order */ #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0]\n\t" - "ldr r5, [r0, #4]\n\t" + "mov %[a], #0x2c\n\t" + "lsl %[a], %[a], #8\n\t" + "add %[a], %[a], #0x13\n\t" #else - "ldrd r4, r5, [r0]\n\t" + "mov %[a], #0x2c13\n\t" #endif - "subs r4, r4, r12\n\t" - "sbcs r5, r5, %[rt]\n\t" + "movt %[a], #0xa30a\n\t" + "mov lr, #0\n\t" + "ldm %[s]!, {r10, r11, r12}\n\t" + "umlal r10, lr, %[b], %[a]\n\t" + "umaal r11, lr, %[c], %[a]\n\t" + "umaal r12, lr, r4, %[a]\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm %[s]!, {r10, r11, r12}\n\t" + "umaal r10, lr, r5, %[a]\n\t" + "umaal r11, lr, r6, %[a]\n\t" + "umaal r12, lr, r7, %[a]\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm %[s]!, {r10, r11}\n\t" + "umaal r10, lr, r8, %[a]\n\t" + "bfc r11, #28, #4\n\t" + "umaal r11, lr, r9, %[a]\n\t" + "stm sp!, {r10, r11, lr}\n\t" + "sub %[s], %[s], #16\n\t" + "sub sp, sp, #32\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0]\n\t" - "str r5, [r0, #4]\n\t" + "mov %[a], #0x9c\n\t" + "lsl %[a], %[a], #8\n\t" + "add %[a], %[a], #0xe5\n\t" #else - "strd r4, r5, [r0]\n\t" + "mov %[a], #0x9ce5\n\t" #endif + "movt %[a], #0xa7ed\n\t" + "mov lr, #0\n\t" + "ldm sp, {r10, r11, r12}\n\t" + "umlal r10, lr, %[b], %[a]\n\t" + "umaal r11, lr, %[c], %[a]\n\t" + "umaal r12, lr, r4, %[a]\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm sp, {r10, r11, r12}\n\t" + "umaal r10, lr, r5, %[a]\n\t" + "umaal r11, lr, r6, %[a]\n\t" + "umaal r12, lr, r7, %[a]\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm sp, {r10, r11}\n\t" + "umaal r10, lr, r8, %[a]\n\t" + "umaal r11, lr, r9, %[a]\n\t" + "stm sp!, {r10, r11, lr}\n\t" + "sub sp, sp, #32\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0, #8]\n\t" - "ldr r5, [r0, #12]\n\t" + "mov %[a], #0x63\n\t" + "lsl %[a], %[a], #8\n\t" + "add %[a], %[a], #0x29\n\t" #else - "ldrd r4, r5, [r0, #8]\n\t" + "mov %[a], #0x6329\n\t" #endif - "sbcs r4, r4, %[rt]\n\t" - "sbcs r5, r5, %[rt]\n\t" + "movt %[a], #0x5d08\n\t" + "mov lr, #0\n\t" + "ldm sp, {r10, r11, r12}\n\t" + "umlal r10, lr, %[b], %[a]\n\t" + "umaal r11, lr, %[c], %[a]\n\t" + "umaal r12, lr, r4, %[a]\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm sp, {r10, r11, r12}\n\t" + "umaal r10, lr, r5, %[a]\n\t" + "umaal r11, lr, r6, %[a]\n\t" + "umaal r12, lr, r7, %[a]\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm sp, {r10, r11}\n\t" + "umaal r10, lr, r8, %[a]\n\t" + "umaal r11, lr, r9, %[a]\n\t" + "stm sp!, {r10, r11, lr}\n\t" + "sub sp, sp, #32\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0, #8]\n\t" - "str r5, [r0, #12]\n\t" + "mov %[a], #0x6\n\t" + "lsl %[a], %[a], #8\n\t" + "add %[a], %[a], #0x21\n\t" #else - "strd r4, r5, [r0, #8]\n\t" + "mov %[a], #0x621\n\t" #endif + "movt %[a], #0xeb21\n\t" + "mov lr, #0\n\t" + "ldm sp, {r10, r11, r12}\n\t" + "umlal r10, lr, %[b], %[a]\n\t" + "umaal r11, lr, %[c], %[a]\n\t" + "umaal r12, lr, r4, %[a]\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm sp, {r10, 
r11, r12}\n\t" + "umaal r10, lr, r5, %[a]\n\t" + "umaal r11, lr, r6, %[a]\n\t" + "umaal r12, lr, r7, %[a]\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm sp, {r10, r11}\n\t" + "umaal r10, lr, r8, %[a]\n\t" + "umaal r11, lr, r9, %[a]\n\t" + "stm sp!, {r10, r11, lr}\n\t" + "sub sp, sp, #32\n\t" + /* Subtract at 4 * 32 */ + "ldm sp, {r10, r11, r12}\n\t" + "subs r10, r10, %[b]\n\t" + "sbcs r11, r11, %[c]\n\t" + "sbcs r12, r12, r4\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm sp, {r10, r11, r12}\n\t" + "sbcs r10, r10, r5\n\t" + "sbcs r11, r11, r6\n\t" + "sbcs r12, r12, r7\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm sp, {r10, r11}\n\t" + "sbcs r10, r10, r8\n\t" + "sbc r11, r11, r9\n\t" + "stm sp!, {r10, r11}\n\t" + "sub sp, sp, #36\n\t" + "asr lr, r11, #25\n\t" + /* Conditionally subtract order starting at bit 125 */ #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r0, #16]\n\t" - "ldr r5, [r0, #20]\n\t" + "mov %[a], #0xa00000\n\t" + "lsl %[a], %[a], #8\n\t" + "add %[a], %[a], #0x0\n\t" #else - "ldrd r4, r5, [r0, #16]\n\t" + "mov %[a], #0xa0000000\n\t" #endif - "sbcs r4, r4, %[rt]\n\t" - "sbcs r5, r5, %[rt]\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r0, #16]\n\t" - "str r5, [r0, #20]\n\t" + "mov %[b], #0xba\n\t" + "lsl %[b], %[b], #8\n\t" + "add %[b], %[b], #0x7d\n\t" #else - "strd r4, r5, [r0, #16]\n\t" + "mov %[b], #0xba7d\n\t" #endif - "sbcs r8, r8, %[rt]\n\t" - "sbc r9, r9, lr\n\t" + "movt %[b], #0x4b9e\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [r0, #24]\n\t" - "str r9, [r0, #28]\n\t" + "mov %[c], #0x4c\n\t" + "lsl %[c], %[c], #8\n\t" + "add %[c], %[c], #0x63\n\t" #else - "strd r8, r9, [r0, #24]\n\t" + "mov %[c], #0x4c63\n\t" #endif - "mov r12, #-19\n\t" - "asr %[rt], r11, #31\n\t" - /* Mask the modulus */ - "and r12, %[rt], r12\n\t" - "and lr, %[rt], #0x7fffffff\n\t" - /* Add modulus (if underflow) */ + "movt %[c], #0xcb02\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1]\n\t" - "ldr r5, [r1, #4]\n\t" + "mov r4, #0xf3\n\t" + "lsl r4, r4, #8\n\t" + "add r4, r4, #0x9a\n\t" #else - "ldrd r4, r5, [r1]\n\t" + "mov r4, #0xf39a\n\t" #endif - "adds r4, r4, r12\n\t" - "adcs r5, r5, %[rt]\n\t" + "movt r4, #0xd45e\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1]\n\t" - "str r5, [r1, #4]\n\t" + "mov r5, #0xdf\n\t" + "lsl r5, r5, #8\n\t" + "add r5, r5, #0x3b\n\t" #else - "strd r4, r5, [r1]\n\t" + "mov r5, #0xdf3b\n\t" #endif + "movt r5, #0x29b\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #8]\n\t" - "ldr r5, [r1, #12]\n\t" + "mov r9, #0x20000\n\t" + "lsl r9, r9, #8\n\t" + "add r9, r9, #0x0\n\t" #else - "ldrd r4, r5, [r1, #8]\n\t" + "mov r9, #0x2000000\n\t" #endif - "adcs r4, r4, %[rt]\n\t" - "adcs r5, r5, %[rt]\n\t" + "and %[a], %[a], lr\n\t" + "and %[b], %[b], lr\n\t" + "and %[c], %[c], lr\n\t" + "and r4, r4, lr\n\t" + "and r5, r5, lr\n\t" + "and r9, r9, lr\n\t" + "ldm sp, {r10, r11, r12}\n\t" + "adds r10, r10, %[a]\n\t" + "adcs r11, r11, %[b]\n\t" + "adcs r12, r12, %[c]\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm sp, {r10, r11, r12}\n\t" + "adcs r10, r10, r4\n\t" + "adcs r11, r11, r5\n\t" + "adcs r12, r12, #0\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "ldm sp, {r10, r11, r12}\n\t" + "adcs r10, r10, #0\n\t" + "adcs r11, r11, #0\n\t" + "adcs r12, r12, r9\n\t" + "stm sp!, {r10, r11, r12}\n\t" + "sub sp, sp, #48\n\t" + "sub %[s], %[s], #16\n\t" + /* Load bits 252-376 */ + "add sp, sp, #28\n\t" + "ldm sp, {%[a], 
%[b], %[c], r4, r5}\n\t" + "lsl r5, r5, #4\n\t" + "orr r5, r5, r4, lsr #28\n\t" + "lsl r4, r4, #4\n\t" + "orr r4, r4, %[c], lsr #28\n\t" + "lsl %[c], %[c], #4\n\t" + "orr %[c], %[c], %[b], lsr #28\n\t" + "lsl %[b], %[b], #4\n\t" + "orr %[b], %[b], %[a], lsr #28\n\t" + "bfc r5, #29, #3\n\t" + "sub sp, sp, #28\n\t" + /* Sub product of top 8 words and order */ + /* * -5cf5d3ed */ +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "mov %[a], #0x2c\n\t" + "lsl %[a], %[a], #8\n\t" + "add %[a], %[a], #0x13\n\t" +#else + "mov %[a], #0x2c13\n\t" +#endif + "movt %[a], #0xa30a\n\t" + "mov lr, #0\n\t" + "ldm sp, {r6, r7, r8, r9}\n\t" + "umlal r6, lr, %[b], %[a]\n\t" + "umaal r7, lr, %[c], %[a]\n\t" + "umaal r8, lr, r4, %[a]\n\t" + "umaal r9, lr, r5, %[a]\n\t" + "stm sp, {r6, r7, r8, r9}\n\t" + "add sp, sp, #4\n\t" + /* * -5812631b */ +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "mov %[a], #0x9c\n\t" + "lsl %[a], %[a], #8\n\t" + "add %[a], %[a], #0xe5\n\t" +#else + "mov %[a], #0x9ce5\n\t" +#endif + "movt %[a], #0xa7ed\n\t" + "mov r10, #0\n\t" + "ldm sp, {r6, r7, r8, r9}\n\t" + "umlal r6, r10, %[b], %[a]\n\t" + "umaal r7, r10, %[c], %[a]\n\t" + "umaal r8, r10, r4, %[a]\n\t" + "umaal r9, r10, r5, %[a]\n\t" + "stm sp, {r6, r7, r8, r9}\n\t" + "add sp, sp, #4\n\t" + /* * -a2f79cd7 */ +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "mov %[a], #0x63\n\t" + "lsl %[a], %[a], #8\n\t" + "add %[a], %[a], #0x29\n\t" +#else + "mov %[a], #0x6329\n\t" +#endif + "movt %[a], #0x5d08\n\t" + "mov r11, #0\n\t" + "ldm sp, {r6, r7, r8, r9}\n\t" + "umlal r6, r11, %[b], %[a]\n\t" + "umaal r7, r11, %[c], %[a]\n\t" + "umaal r8, r11, r4, %[a]\n\t" + "umaal r9, r11, r5, %[a]\n\t" + "stm sp, {r6, r7, r8, r9}\n\t" + "add sp, sp, #4\n\t" + /* * -14def9df */ +#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "mov %[a], #0x6\n\t" + "lsl %[a], %[a], #8\n\t" + "add %[a], %[a], #0x21\n\t" +#else + "mov %[a], #0x621\n\t" +#endif + "movt %[a], #0xeb21\n\t" + "mov r12, #0\n\t" + "ldm sp, {r6, r7, r8, r9}\n\t" + "umlal r6, r12, %[b], %[a]\n\t" + "umaal r7, r12, %[c], %[a]\n\t" + "umaal r8, r12, r4, %[a]\n\t" + "umaal r9, r12, r5, %[a]\n\t" + "stm sp, {r6, r7, r8, r9}\n\t" + "add sp, sp, #4\n\t" + /* Add overflows at 4 * 32 */ + "ldm sp, {r6, r7, r8, r9}\n\t" + "bfc r9, #28, #4\n\t" + "adds r6, r6, lr\n\t" + "adcs r7, r7, r10\n\t" + "adcs r8, r8, r11\n\t" + "adc r9, r9, r12\n\t" + /* Subtract top at 4 * 32 */ + "subs r6, r6, %[b]\n\t" + "sbcs r7, r7, %[c]\n\t" + "sbcs r8, r8, r4\n\t" + "sbcs r9, r9, r5\n\t" + "sbc %[a], %[a], %[a]\n\t" + "sub sp, sp, #16\n\t" + "ldm sp, {%[b], %[c], r4, r5}\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1, #8]\n\t" - "str r5, [r1, #12]\n\t" + "mov r10, #0xd3\n\t" + "lsl r10, r10, #8\n\t" + "add r10, r10, #0xed\n\t" #else - "strd r4, r5, [r1, #8]\n\t" + "mov r10, #0xd3ed\n\t" #endif + "movt r10, #0x5cf5\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr r4, [r1, #16]\n\t" - "ldr r5, [r1, #20]\n\t" + "mov r11, #0x63\n\t" + "lsl r11, r11, #8\n\t" + "add r11, r11, #0x1a\n\t" #else - "ldrd r4, r5, [r1, #16]\n\t" + "mov r11, #0x631a\n\t" #endif - "adcs r4, r4, %[rt]\n\t" - "adcs r5, r5, %[rt]\n\t" + "movt r11, #0x5812\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r4, [r1, #16]\n\t" - "str r5, [r1, #20]\n\t" + "mov r12, #0x9c\n\t" + "lsl r12, r12, #8\n\t" + "add r12, r12, #0xd6\n\t" #else - "strd r4, r5, [r1, #16]\n\t" + "mov r12, #0x9cd6\n\t" #endif - "adcs r10, r10, 
%[rt]\n\t" - "adc r11, r11, lr\n\t" + "movt r12, #0xa2f7\n\t" #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [r1, #24]\n\t" - "str r11, [r1, #28]\n\t" + "mov lr, #0xf9\n\t" + "lsl lr, lr, #8\n\t" + "add lr, lr, #0xde\n\t" #else - "strd r10, r11, [r1, #24]\n\t" + "mov lr, #0xf9de\n\t" #endif - "add sp, sp, #0x60\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt) + "movt lr, #0x14de\n\t" + "and r10, r10, %[a]\n\t" + "and r11, r11, %[a]\n\t" + "and r12, r12, %[a]\n\t" + "and lr, lr, %[a]\n\t" + "adds %[b], %[b], r10\n\t" + "adcs %[c], %[c], r11\n\t" + "adcs r4, r4, r12\n\t" + "adcs r5, r5, lr\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "and %[a], %[a], #0x10000000\n\t" + "adcs r8, r8, #0\n\t" + "adc r9, r9, %[a]\n\t" + "bfc r9, #28, #4\n\t" + /* Store result */ + "stm %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" + "add sp, sp, #0x50\n\t" + : [s] "+r" (s), [a] "+r" (a), [b] "+r" (b), [c] "+r" (c) : : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); - (void)px_p; - (void)py_p; - (void)pz_p; - (void)pt_p; - (void)qz_p; - (void)qt2d_p; - (void)qyplusx_p; - (void)qyminusx_p; } +#endif /* HAVE_ED25519 */ -#endif /* HAVE_CURVE25519 */ -#endif /* !__aarch64__ */ +#endif /* !CURVE25519_SMALL || !ED25519_SMALL */ +#endif /* HAVE_CURVE25519 || HAVE_ED25519 */ +#endif /* !__aarch64__ && !__thumb__ */ #endif /* WOLFSSL_ARMASM */ +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S index ed3364d436..d2715c6def 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S @@ -30,7 +30,8 @@ #include #ifdef WOLFSSL_ARMASM -#ifndef __aarch64__ +#if !defined(__aarch64__) && defined(__arm__) +#ifndef WOLFSSL_ARMASM_INLINE #ifndef NO_SHA256 #ifdef WOLFSSL_ARMASM_NO_NEON .text @@ -162,30 +163,14 @@ Transform_Sha256_Len: # Start of loop processing a block L_SHA256_transform_len_begin: # Load, Reverse and Store W - 64 bytes -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r4, [r1] ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r6, [r1, #8] ldr r7, [r1, #12] -#else - ldrd r6, r7, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r8, [r1, #16] ldr r9, [r1, #20] -#else - ldrd r8, r9, [r1, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r10, [r1, #24] ldr r11, [r1, #28] -#else - ldrd r10, r11, [r1, #24] -#endif rev r4, r4 rev r5, r5 rev r6, r6 @@ -218,30 +203,14 @@ L_SHA256_transform_len_begin: #else strd r10, r11, [sp, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r4, [r1, #32] ldr r5, [r1, #36] -#else - ldrd r4, r5, [r1, #32] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r6, [r1, #40] ldr r7, [r1, #44] -#else - ldrd r6, r7, [r1, #40] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r8, [r1, #48] ldr r9, [r1, #52] -#else - ldrd r8, r9, [r1, #48] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r10, [r1, #56] ldr r11, [r1, #60] -#else - ldrd r10, r11, [r1, #56] -#endif rev r4, r4 rev r5, r5 rev r6, r6 @@ -2765,9 +2734,10 @@ L_SHA256_transform_neon_len_start: .size Transform_Sha256_Len,.-Transform_Sha256_Len #endif /* WOLFSSL_ARMASM_NO_NEON */ #endif /* !NO_SHA256 */ -#endif /* !__aarch64__ */ +#endif /* !__aarch64__ && !__thumb__ */ 
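
For reference on the sc_reduce and sc_muladd routines above: both reduce a wide value modulo the ed25519 group order L = 2^252 + d, with d = 0x14def9dea2f79cd65812631a5cf5d3ed (d fits in 125 bits, hence the "Conditionally subtract order starting at bit 125" step). Writing the input as x = h*2^252 + l, the identity the code exploits is

    x = h*2^252 + l == l - h*d    (mod L)

and the subtraction of h*d is carried out with unsigned multiply-accumulates as

    l - h*d = l + h*(2^128 - d) - h*2^128

which matches the shape of the hunks above: the mov/movt constants 0xa30a2c13, 0xa7ed9ce5, 0x5d086329 and 0xeb210621 are the little-endian words of 2^128 - d (the "* -5cf5d3ed" style comments), and the "Subtract at 4 * 32" block removes the h*2^128 term four words up. No new symbols are introduced here; this only restates the constants already present in the patch.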
#endif /* WOLFSSL_ARMASM */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif +#endif /* !WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c index d81d5bba42..97b53420a7 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c @@ -30,84 +30,37 @@ #include #ifdef WOLFSSL_ARMASM -#ifndef __aarch64__ +#if !defined(__aarch64__) && defined(__arm__) #include #ifdef HAVE_CONFIG_H #include #endif /* HAVE_CONFIG_H */ #include +#ifdef WOLFSSL_ARMASM_INLINE #ifndef NO_SHA256 #include #ifdef WOLFSSL_ARMASM_NO_NEON static const uint32_t L_SHA256_transform_len_k[] = { - 0x428a2f98, - 0x71374491, - 0xb5c0fbcf, - 0xe9b5dba5, - 0x3956c25b, - 0x59f111f1, - 0x923f82a4, - 0xab1c5ed5, - 0xd807aa98, - 0x12835b01, - 0x243185be, - 0x550c7dc3, - 0x72be5d74, - 0x80deb1fe, - 0x9bdc06a7, - 0xc19bf174, - 0xe49b69c1, - 0xefbe4786, - 0xfc19dc6, - 0x240ca1cc, - 0x2de92c6f, - 0x4a7484aa, - 0x5cb0a9dc, - 0x76f988da, - 0x983e5152, - 0xa831c66d, - 0xb00327c8, - 0xbf597fc7, - 0xc6e00bf3, - 0xd5a79147, - 0x6ca6351, - 0x14292967, - 0x27b70a85, - 0x2e1b2138, - 0x4d2c6dfc, - 0x53380d13, - 0x650a7354, - 0x766a0abb, - 0x81c2c92e, - 0x92722c85, - 0xa2bfe8a1, - 0xa81a664b, - 0xc24b8b70, - 0xc76c51a3, - 0xd192e819, - 0xd6990624, - 0xf40e3585, - 0x106aa070, - 0x19a4c116, - 0x1e376c08, - 0x2748774c, - 0x34b0bcb5, - 0x391c0cb3, - 0x4ed8aa4a, - 0x5b9cca4f, - 0x682e6ff3, - 0x748f82ee, - 0x78a5636f, - 0x84c87814, - 0x8cc70208, - 0x90befffa, - 0xa4506ceb, - 0xbef9a3f7, - 0xc67178f2, + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, }; -void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p); +void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len); void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) { register wc_Sha256* sha256 asm ("r0") = sha256_p; @@ -170,30 +123,14 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) "\n" "L_SHA256_transform_len_begin_%=: \n\t" /* Load, Reverse and Store W - 64 bytes */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r4, [%[data]]\n\t" "ldr r5, [%[data], #4]\n\t" -#else - "ldrd r4, r5, [%[data]]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r6, [%[data], #8]\n\t" "ldr r7, [%[data], #12]\n\t" -#else - "ldrd r6, r7, [%[data], #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r8, [%[data], #16]\n\t" "ldr r9, [%[data], #20]\n\t" -#else - "ldrd r8, r9, [%[data], #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r10, [%[data], #24]\n\t" "ldr r11, [%[data], #28]\n\t" -#else - 
"ldrd r10, r11, [%[data], #24]\n\t" -#endif "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" @@ -226,30 +163,14 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) #else "strd r10, r11, [sp, #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r4, [%[data], #32]\n\t" "ldr r5, [%[data], #36]\n\t" -#else - "ldrd r4, r5, [%[data], #32]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r6, [%[data], #40]\n\t" "ldr r7, [%[data], #44]\n\t" -#else - "ldrd r6, r7, [%[data], #40]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r8, [%[data], #48]\n\t" "ldr r9, [%[data], #52]\n\t" -#else - "ldrd r8, r9, [%[data], #48]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r10, [%[data], #56]\n\t" "ldr r11, [%[data], #60]\n\t" -#else - "ldrd r10, r11, [%[data], #56]\n\t" -#endif "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" @@ -1667,7 +1588,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) "bne L_SHA256_transform_len_begin_%=\n\t" "add sp, sp, #0xc0\n\t" : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len) - : [L_SHA256_transform_len_k] "r" (L_SHA256_transform_len_k) + : [L_SHA256_transform_len_k] "g" (L_SHA256_transform_len_k) : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" ); } @@ -1677,73 +1598,25 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) #ifndef WOLFSSL_ARMASM_NO_NEON static const uint32_t L_SHA256_transform_neon_len_k[] = { - 0x428a2f98, - 0x71374491, - 0xb5c0fbcf, - 0xe9b5dba5, - 0x3956c25b, - 0x59f111f1, - 0x923f82a4, - 0xab1c5ed5, - 0xd807aa98, - 0x12835b01, - 0x243185be, - 0x550c7dc3, - 0x72be5d74, - 0x80deb1fe, - 0x9bdc06a7, - 0xc19bf174, - 0xe49b69c1, - 0xefbe4786, - 0xfc19dc6, - 0x240ca1cc, - 0x2de92c6f, - 0x4a7484aa, - 0x5cb0a9dc, - 0x76f988da, - 0x983e5152, - 0xa831c66d, - 0xb00327c8, - 0xbf597fc7, - 0xc6e00bf3, - 0xd5a79147, - 0x6ca6351, - 0x14292967, - 0x27b70a85, - 0x2e1b2138, - 0x4d2c6dfc, - 0x53380d13, - 0x650a7354, - 0x766a0abb, - 0x81c2c92e, - 0x92722c85, - 0xa2bfe8a1, - 0xa81a664b, - 0xc24b8b70, - 0xc76c51a3, - 0xd192e819, - 0xd6990624, - 0xf40e3585, - 0x106aa070, - 0x19a4c116, - 0x1e376c08, - 0x2748774c, - 0x34b0bcb5, - 0x391c0cb3, - 0x4ed8aa4a, - 0x5b9cca4f, - 0x682e6ff3, - 0x748f82ee, - 0x78a5636f, - 0x84c87814, - 0x8cc70208, - 0x90befffa, - 0xa4506ceb, - 0xbef9a3f7, - 0xc67178f2, + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, }; -void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p); +void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len); void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) { 
register wc_Sha256* sha256 asm ("r0") = sha256_p; @@ -2776,12 +2649,13 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) "bne L_SHA256_transform_neon_len_begin_%=\n\t" "add sp, sp, #24\n\t" : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len) - : [L_SHA256_transform_neon_len_k] "r" (L_SHA256_transform_neon_len_k) + : [L_SHA256_transform_neon_len_k] "g" (L_SHA256_transform_neon_len_k) : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11" ); } #endif /* WOLFSSL_ARMASM_NO_NEON */ #endif /* !NO_SHA256 */ -#endif /* !__aarch64__ */ +#endif /* !__aarch64__ && !__thumb__ */ #endif /* WOLFSSL_ARMASM */ +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S index 5627688a8a..1c4a7176fd 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S @@ -30,7 +30,8 @@ #include #ifdef WOLFSSL_ARMASM -#ifndef __aarch64__ +#if !defined(__aarch64__) && defined(__arm__) +#ifndef WOLFSSL_ARMASM_INLINE #ifdef WOLFSSL_SHA512 #ifdef WOLFSSL_ARMASM_NO_NEON .text @@ -306,30 +307,14 @@ Transform_Sha512_Len: # Start of loop processing a block L_SHA512_transform_len_begin: # Load, Reverse and Store W -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r4, [r1] ldr r5, [r1, #4] -#else - ldrd r4, r5, [r1] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r6, [r1, #8] ldr r7, [r1, #12] -#else - ldrd r6, r7, [r1, #8] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r8, [r1, #16] ldr r9, [r1, #20] -#else - ldrd r8, r9, [r1, #16] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r10, [r1, #24] ldr r11, [r1, #28] -#else - ldrd r10, r11, [r1, #24] -#endif rev r4, r4 rev r5, r5 rev r6, r6 @@ -346,30 +331,14 @@ L_SHA512_transform_len_begin: str r8, [sp, #20] str r11, [sp, #24] str r10, [sp, #28] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r4, [r1, #32] ldr r5, [r1, #36] -#else - ldrd r4, r5, [r1, #32] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r6, [r1, #40] ldr r7, [r1, #44] -#else - ldrd r6, r7, [r1, #40] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r8, [r1, #48] ldr r9, [r1, #52] -#else - ldrd r8, r9, [r1, #48] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r10, [r1, #56] ldr r11, [r1, #60] -#else - ldrd r10, r11, [r1, #56] -#endif rev r4, r4 rev r5, r5 rev r6, r6 @@ -386,30 +355,14 @@ L_SHA512_transform_len_begin: str r8, [sp, #52] str r11, [sp, #56] str r10, [sp, #60] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r4, [r1, #64] ldr r5, [r1, #68] -#else - ldrd r4, r5, [r1, #64] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r6, [r1, #72] ldr r7, [r1, #76] -#else - ldrd r6, r7, [r1, #72] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r8, [r1, #80] ldr r9, [r1, #84] -#else - ldrd r8, r9, [r1, #80] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r10, [r1, #88] ldr r11, [r1, #92] -#else - ldrd r10, r11, [r1, #88] -#endif rev r4, r4 rev r5, r5 rev r6, r6 @@ -426,30 +379,14 @@ L_SHA512_transform_len_begin: str r8, [sp, #84] str r11, [sp, #88] str r10, [sp, #92] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r4, [r1, #96] ldr r5, [r1, #100] 
-#else - ldrd r4, r5, [r1, #96] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r6, [r1, #104] ldr r7, [r1, #108] -#else - ldrd r6, r7, [r1, #104] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r8, [r1, #112] ldr r9, [r1, #116] -#else - ldrd r8, r9, [r1, #112] -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) ldr r10, [r1, #120] ldr r11, [r1, #124] -#else - ldrd r10, r11, [r1, #120] -#endif rev r4, r4 rev r5, r5 rev r6, r6 @@ -9233,9 +9170,10 @@ L_SHA512_transform_neon_len_start: .size Transform_Sha512_Len,.-Transform_Sha512_Len #endif /* !WOLFSSL_ARMASM_NO_NEON */ #endif /* WOLFSSL_SHA512 */ -#endif /* !__aarch64__ */ +#endif /* !__aarch64__ && !__thumb__ */ #endif /* WOLFSSL_ARMASM */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif +#endif /* !WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c index 06f2bf4df6..0a513ac044 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c @@ -30,100 +30,61 @@ #include #ifdef WOLFSSL_ARMASM -#ifndef __aarch64__ +#if !defined(__aarch64__) && defined(__arm__) #include #ifdef HAVE_CONFIG_H #include #endif /* HAVE_CONFIG_H */ #include +#ifdef WOLFSSL_ARMASM_INLINE #ifdef WOLFSSL_SHA512 #include #ifdef WOLFSSL_ARMASM_NO_NEON static const uint64_t L_SHA512_transform_len_k[] = { - 0x428a2f98d728ae22UL, - 0x7137449123ef65cdUL, - 0xb5c0fbcfec4d3b2fUL, - 0xe9b5dba58189dbbcUL, - 0x3956c25bf348b538UL, - 0x59f111f1b605d019UL, - 0x923f82a4af194f9bUL, - 0xab1c5ed5da6d8118UL, - 0xd807aa98a3030242UL, - 0x12835b0145706fbeUL, - 0x243185be4ee4b28cUL, - 0x550c7dc3d5ffb4e2UL, - 0x72be5d74f27b896fUL, - 0x80deb1fe3b1696b1UL, - 0x9bdc06a725c71235UL, - 0xc19bf174cf692694UL, - 0xe49b69c19ef14ad2UL, - 0xefbe4786384f25e3UL, - 0xfc19dc68b8cd5b5UL, - 0x240ca1cc77ac9c65UL, - 0x2de92c6f592b0275UL, - 0x4a7484aa6ea6e483UL, - 0x5cb0a9dcbd41fbd4UL, - 0x76f988da831153b5UL, - 0x983e5152ee66dfabUL, - 0xa831c66d2db43210UL, - 0xb00327c898fb213fUL, - 0xbf597fc7beef0ee4UL, - 0xc6e00bf33da88fc2UL, - 0xd5a79147930aa725UL, - 0x6ca6351e003826fUL, - 0x142929670a0e6e70UL, - 0x27b70a8546d22ffcUL, - 0x2e1b21385c26c926UL, - 0x4d2c6dfc5ac42aedUL, - 0x53380d139d95b3dfUL, - 0x650a73548baf63deUL, - 0x766a0abb3c77b2a8UL, - 0x81c2c92e47edaee6UL, - 0x92722c851482353bUL, - 0xa2bfe8a14cf10364UL, - 0xa81a664bbc423001UL, - 0xc24b8b70d0f89791UL, - 0xc76c51a30654be30UL, - 0xd192e819d6ef5218UL, - 0xd69906245565a910UL, - 0xf40e35855771202aUL, - 0x106aa07032bbd1b8UL, - 0x19a4c116b8d2d0c8UL, - 0x1e376c085141ab53UL, - 0x2748774cdf8eeb99UL, - 0x34b0bcb5e19b48a8UL, - 0x391c0cb3c5c95a63UL, - 0x4ed8aa4ae3418acbUL, - 0x5b9cca4f7763e373UL, - 0x682e6ff3d6b2b8a3UL, - 0x748f82ee5defb2fcUL, - 0x78a5636f43172f60UL, - 0x84c87814a1f0ab72UL, - 0x8cc702081a6439ecUL, - 0x90befffa23631e28UL, - 0xa4506cebde82bde9UL, - 0xbef9a3f7b2c67915UL, - 0xc67178f2e372532bUL, - 0xca273eceea26619cUL, - 0xd186b8c721c0c207UL, - 0xeada7dd6cde0eb1eUL, - 0xf57d4f7fee6ed178UL, - 0x6f067aa72176fbaUL, - 0xa637dc5a2c898a6UL, - 0x113f9804bef90daeUL, - 0x1b710b35131c471bUL, - 0x28db77f523047d84UL, - 0x32caab7b40c72493UL, - 0x3c9ebe0a15c9bebcUL, - 0x431d67c49c100d4cUL, - 0x4cc5d4becb3e42b6UL, - 0x597f299cfc657e2aUL, - 0x5fcb6fab3ad6faecUL, - 0x6c44198c4a475817UL, + 0x428a2f98d728ae22, 0x7137449123ef65cd, + 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc, + 0x3956c25bf348b538, 0x59f111f1b605d019, + 0x923f82a4af194f9b, 
0xab1c5ed5da6d8118, + 0xd807aa98a3030242, 0x12835b0145706fbe, + 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2, + 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, + 0x9bdc06a725c71235, 0xc19bf174cf692694, + 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, + 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, + 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, + 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5, + 0x983e5152ee66dfab, 0xa831c66d2db43210, + 0xb00327c898fb213f, 0xbf597fc7beef0ee4, + 0xc6e00bf33da88fc2, 0xd5a79147930aa725, + 0x06ca6351e003826f, 0x142929670a0e6e70, + 0x27b70a8546d22ffc, 0x2e1b21385c26c926, + 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df, + 0x650a73548baf63de, 0x766a0abb3c77b2a8, + 0x81c2c92e47edaee6, 0x92722c851482353b, + 0xa2bfe8a14cf10364, 0xa81a664bbc423001, + 0xc24b8b70d0f89791, 0xc76c51a30654be30, + 0xd192e819d6ef5218, 0xd69906245565a910, + 0xf40e35855771202a, 0x106aa07032bbd1b8, + 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, + 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8, + 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, + 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, + 0x748f82ee5defb2fc, 0x78a5636f43172f60, + 0x84c87814a1f0ab72, 0x8cc702081a6439ec, + 0x90befffa23631e28, 0xa4506cebde82bde9, + 0xbef9a3f7b2c67915, 0xc67178f2e372532b, + 0xca273eceea26619c, 0xd186b8c721c0c207, + 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, + 0x06f067aa72176fba, 0x0a637dc5a2c898a6, + 0x113f9804bef90dae, 0x1b710b35131c471b, + 0x28db77f523047d84, 0x32caab7b40c72493, + 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c, + 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, + 0x5fcb6fab3ad6faec, 0x6c44198c4a475817, }; -void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p); +void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len); void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) { register wc_Sha512* sha512 asm ("r0") = sha512_p; @@ -234,30 +195,14 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "\n" "L_SHA512_transform_len_begin_%=: \n\t" /* Load, Reverse and Store W */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r4, [%[data]]\n\t" "ldr r5, [%[data], #4]\n\t" -#else - "ldrd r4, r5, [%[data]]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r6, [%[data], #8]\n\t" "ldr r7, [%[data], #12]\n\t" -#else - "ldrd r6, r7, [%[data], #8]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r8, [%[data], #16]\n\t" "ldr r9, [%[data], #20]\n\t" -#else - "ldrd r8, r9, [%[data], #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r10, [%[data], #24]\n\t" "ldr r11, [%[data], #28]\n\t" -#else - "ldrd r10, r11, [%[data], #24]\n\t" -#endif "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" @@ -274,30 +219,14 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "str r8, [sp, #20]\n\t" "str r11, [sp, #24]\n\t" "str r10, [sp, #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r4, [%[data], #32]\n\t" "ldr r5, [%[data], #36]\n\t" -#else - "ldrd r4, r5, [%[data], #32]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r6, [%[data], #40]\n\t" "ldr r7, [%[data], #44]\n\t" -#else - "ldrd r6, r7, [%[data], #40]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r8, [%[data], #48]\n\t" "ldr r9, [%[data], #52]\n\t" -#else - "ldrd r8, r9, [%[data], #48]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r10, [%[data], #56]\n\t" "ldr r11, [%[data], #60]\n\t" 
-#else - "ldrd r10, r11, [%[data], #56]\n\t" -#endif "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" @@ -314,30 +243,14 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "str r8, [sp, #52]\n\t" "str r11, [sp, #56]\n\t" "str r10, [sp, #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r4, [%[data], #64]\n\t" "ldr r5, [%[data], #68]\n\t" -#else - "ldrd r4, r5, [%[data], #64]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r6, [%[data], #72]\n\t" "ldr r7, [%[data], #76]\n\t" -#else - "ldrd r6, r7, [%[data], #72]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r8, [%[data], #80]\n\t" "ldr r9, [%[data], #84]\n\t" -#else - "ldrd r8, r9, [%[data], #80]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r10, [%[data], #88]\n\t" "ldr r11, [%[data], #92]\n\t" -#else - "ldrd r10, r11, [%[data], #88]\n\t" -#endif "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" @@ -354,30 +267,14 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "str r8, [sp, #84]\n\t" "str r11, [sp, #88]\n\t" "str r10, [sp, #92]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r4, [%[data], #96]\n\t" "ldr r5, [%[data], #100]\n\t" -#else - "ldrd r4, r5, [%[data], #96]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r6, [%[data], #104]\n\t" "ldr r7, [%[data], #108]\n\t" -#else - "ldrd r6, r7, [%[data], #104]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r8, [%[data], #112]\n\t" "ldr r9, [%[data], #116]\n\t" -#else - "ldrd r8, r9, [%[data], #112]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) "ldr r10, [%[data], #120]\n\t" "ldr r11, [%[data], #124]\n\t" -#else - "ldrd r10, r11, [%[data], #120]\n\t" -#endif "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" @@ -7496,7 +7393,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r0, r0, r0\n\t" "add sp, sp, #0xc0\n\t" : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len) - : [L_SHA512_transform_len_k] "r" (L_SHA512_transform_len_k) + : [L_SHA512_transform_len_k] "g" (L_SHA512_transform_len_k) : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" ); } @@ -7506,89 +7403,49 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #ifndef WOLFSSL_ARMASM_NO_NEON static const uint64_t L_SHA512_transform_neon_len_k[] = { - 0x428a2f98d728ae22UL, - 0x7137449123ef65cdUL, - 0xb5c0fbcfec4d3b2fUL, - 0xe9b5dba58189dbbcUL, - 0x3956c25bf348b538UL, - 0x59f111f1b605d019UL, - 0x923f82a4af194f9bUL, - 0xab1c5ed5da6d8118UL, - 0xd807aa98a3030242UL, - 0x12835b0145706fbeUL, - 0x243185be4ee4b28cUL, - 0x550c7dc3d5ffb4e2UL, - 0x72be5d74f27b896fUL, - 0x80deb1fe3b1696b1UL, - 0x9bdc06a725c71235UL, - 0xc19bf174cf692694UL, - 0xe49b69c19ef14ad2UL, - 0xefbe4786384f25e3UL, - 0xfc19dc68b8cd5b5UL, - 0x240ca1cc77ac9c65UL, - 0x2de92c6f592b0275UL, - 0x4a7484aa6ea6e483UL, - 0x5cb0a9dcbd41fbd4UL, - 0x76f988da831153b5UL, - 0x983e5152ee66dfabUL, - 0xa831c66d2db43210UL, - 0xb00327c898fb213fUL, - 0xbf597fc7beef0ee4UL, - 0xc6e00bf33da88fc2UL, - 0xd5a79147930aa725UL, - 0x6ca6351e003826fUL, - 0x142929670a0e6e70UL, - 0x27b70a8546d22ffcUL, - 0x2e1b21385c26c926UL, - 0x4d2c6dfc5ac42aedUL, - 0x53380d139d95b3dfUL, - 0x650a73548baf63deUL, - 0x766a0abb3c77b2a8UL, - 0x81c2c92e47edaee6UL, - 0x92722c851482353bUL, - 0xa2bfe8a14cf10364UL, - 
0xa81a664bbc423001UL, - 0xc24b8b70d0f89791UL, - 0xc76c51a30654be30UL, - 0xd192e819d6ef5218UL, - 0xd69906245565a910UL, - 0xf40e35855771202aUL, - 0x106aa07032bbd1b8UL, - 0x19a4c116b8d2d0c8UL, - 0x1e376c085141ab53UL, - 0x2748774cdf8eeb99UL, - 0x34b0bcb5e19b48a8UL, - 0x391c0cb3c5c95a63UL, - 0x4ed8aa4ae3418acbUL, - 0x5b9cca4f7763e373UL, - 0x682e6ff3d6b2b8a3UL, - 0x748f82ee5defb2fcUL, - 0x78a5636f43172f60UL, - 0x84c87814a1f0ab72UL, - 0x8cc702081a6439ecUL, - 0x90befffa23631e28UL, - 0xa4506cebde82bde9UL, - 0xbef9a3f7b2c67915UL, - 0xc67178f2e372532bUL, - 0xca273eceea26619cUL, - 0xd186b8c721c0c207UL, - 0xeada7dd6cde0eb1eUL, - 0xf57d4f7fee6ed178UL, - 0x6f067aa72176fbaUL, - 0xa637dc5a2c898a6UL, - 0x113f9804bef90daeUL, - 0x1b710b35131c471bUL, - 0x28db77f523047d84UL, - 0x32caab7b40c72493UL, - 0x3c9ebe0a15c9bebcUL, - 0x431d67c49c100d4cUL, - 0x4cc5d4becb3e42b6UL, - 0x597f299cfc657e2aUL, - 0x5fcb6fab3ad6faecUL, - 0x6c44198c4a475817UL, + 0x428a2f98d728ae22, 0x7137449123ef65cd, + 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc, + 0x3956c25bf348b538, 0x59f111f1b605d019, + 0x923f82a4af194f9b, 0xab1c5ed5da6d8118, + 0xd807aa98a3030242, 0x12835b0145706fbe, + 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2, + 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, + 0x9bdc06a725c71235, 0xc19bf174cf692694, + 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, + 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, + 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, + 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5, + 0x983e5152ee66dfab, 0xa831c66d2db43210, + 0xb00327c898fb213f, 0xbf597fc7beef0ee4, + 0xc6e00bf33da88fc2, 0xd5a79147930aa725, + 0x06ca6351e003826f, 0x142929670a0e6e70, + 0x27b70a8546d22ffc, 0x2e1b21385c26c926, + 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df, + 0x650a73548baf63de, 0x766a0abb3c77b2a8, + 0x81c2c92e47edaee6, 0x92722c851482353b, + 0xa2bfe8a14cf10364, 0xa81a664bbc423001, + 0xc24b8b70d0f89791, 0xc76c51a30654be30, + 0xd192e819d6ef5218, 0xd69906245565a910, + 0xf40e35855771202a, 0x106aa07032bbd1b8, + 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, + 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8, + 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, + 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, + 0x748f82ee5defb2fc, 0x78a5636f43172f60, + 0x84c87814a1f0ab72, 0x8cc702081a6439ec, + 0x90befffa23631e28, 0xa4506cebde82bde9, + 0xbef9a3f7b2c67915, 0xc67178f2e372532b, + 0xca273eceea26619c, 0xd186b8c721c0c207, + 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, + 0x06f067aa72176fba, 0x0a637dc5a2c898a6, + 0x113f9804bef90dae, 0x1b710b35131c471b, + 0x28db77f523047d84, 0x32caab7b40c72493, + 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c, + 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, + 0x5fcb6fab3ad6faec, 0x6c44198c4a475817, }; -void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p); +void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len); void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) { register wc_Sha512* sha512 asm ("r0") = sha512_p; @@ -7639,7 +7496,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "\n" "L_SHA512_transform_neon_len_start_%=: \n\t" /* Round 0 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d4, #50\n\t" "vsri.u64 d8, d4, #14\n\t" "vshl.u64 d9, d0, #36\n\t" @@ -7668,7 +7525,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d3, d7\n\t" "vadd.i64 d7, d10\n\t" /* Round 1 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d3, #50\n\t" "vsri.u64 d8, d3, #14\n\t" "vshl.u64 d9, d7, #36\n\t" @@ -7759,7 +7616,7 @@ void 
Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d17, d11\n\t" #endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ /* Round 2 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d2, #50\n\t" "vsri.u64 d8, d2, #14\n\t" "vshl.u64 d9, d6, #36\n\t" @@ -7788,7 +7645,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d1, d5\n\t" "vadd.i64 d5, d10\n\t" /* Round 3 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d1, #50\n\t" "vsri.u64 d8, d1, #14\n\t" "vshl.u64 d9, d5, #36\n\t" @@ -7879,7 +7736,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d19, d11\n\t" #endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ /* Round 4 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d0, #50\n\t" "vsri.u64 d8, d0, #14\n\t" "vshl.u64 d9, d4, #36\n\t" @@ -7908,7 +7765,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d7, d3\n\t" "vadd.i64 d3, d10\n\t" /* Round 5 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d7, #50\n\t" "vsri.u64 d8, d7, #14\n\t" "vshl.u64 d9, d3, #36\n\t" @@ -7999,7 +7856,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d21, d11\n\t" #endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ /* Round 6 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d6, #50\n\t" "vsri.u64 d8, d6, #14\n\t" "vshl.u64 d9, d2, #36\n\t" @@ -8028,7 +7885,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d5, d1\n\t" "vadd.i64 d1, d10\n\t" /* Round 7 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d5, #50\n\t" "vsri.u64 d8, d5, #14\n\t" "vshl.u64 d9, d1, #36\n\t" @@ -8119,7 +7976,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d23, d11\n\t" #endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ /* Round 8 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d4, #50\n\t" "vsri.u64 d8, d4, #14\n\t" "vshl.u64 d9, d0, #36\n\t" @@ -8148,7 +8005,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d3, d7\n\t" "vadd.i64 d7, d10\n\t" /* Round 9 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d3, #50\n\t" "vsri.u64 d8, d3, #14\n\t" "vshl.u64 d9, d7, #36\n\t" @@ -8239,7 +8096,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d25, d11\n\t" #endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ /* Round 10 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d2, #50\n\t" "vsri.u64 d8, d2, #14\n\t" "vshl.u64 d9, d6, #36\n\t" @@ -8268,7 +8125,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d1, d5\n\t" "vadd.i64 d5, d10\n\t" /* Round 11 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d1, #50\n\t" "vsri.u64 d8, d1, #14\n\t" "vshl.u64 d9, d5, #36\n\t" @@ -8359,7 +8216,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d27, d11\n\t" #endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ /* Round 12 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d0, #50\n\t" "vsri.u64 d8, d0, #14\n\t" "vshl.u64 d9, d4, #36\n\t" @@ -8388,7 +8245,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 
d7, d3\n\t" "vadd.i64 d3, d10\n\t" /* Round 13 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d7, #50\n\t" "vsri.u64 d8, d7, #14\n\t" "vshl.u64 d9, d3, #36\n\t" @@ -8479,7 +8336,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d29, d11\n\t" #endif /* WOLFSSL_ARM_ARCH_NEON_64BIT */ /* Round 14 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d6, #50\n\t" "vsri.u64 d8, d6, #14\n\t" "vshl.u64 d9, d2, #36\n\t" @@ -8508,7 +8365,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d5, d1\n\t" "vadd.i64 d1, d10\n\t" /* Round 15 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d5, #50\n\t" "vsri.u64 d8, d5, #14\n\t" "vshl.u64 d9, d1, #36\n\t" @@ -8601,7 +8458,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "subs r12, r12, #1\n\t" "bne L_SHA512_transform_neon_len_start_%=\n\t" /* Round 0 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d4, #50\n\t" "vsri.u64 d8, d4, #14\n\t" "vshl.u64 d9, d0, #36\n\t" @@ -8630,7 +8487,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d3, d7\n\t" "vadd.i64 d7, d10\n\t" /* Round 1 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d3, #50\n\t" "vsri.u64 d8, d3, #14\n\t" "vshl.u64 d9, d7, #36\n\t" @@ -8659,7 +8516,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d2, d6\n\t" "vadd.i64 d6, d10\n\t" /* Round 2 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d2, #50\n\t" "vsri.u64 d8, d2, #14\n\t" "vshl.u64 d9, d6, #36\n\t" @@ -8688,7 +8545,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d1, d5\n\t" "vadd.i64 d5, d10\n\t" /* Round 3 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d1, #50\n\t" "vsri.u64 d8, d1, #14\n\t" "vshl.u64 d9, d5, #36\n\t" @@ -8717,7 +8574,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d0, d4\n\t" "vadd.i64 d4, d10\n\t" /* Round 4 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d0, #50\n\t" "vsri.u64 d8, d0, #14\n\t" "vshl.u64 d9, d4, #36\n\t" @@ -8746,7 +8603,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d7, d3\n\t" "vadd.i64 d3, d10\n\t" /* Round 5 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d7, #50\n\t" "vsri.u64 d8, d7, #14\n\t" "vshl.u64 d9, d3, #36\n\t" @@ -8775,7 +8632,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d6, d2\n\t" "vadd.i64 d2, d10\n\t" /* Round 6 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d6, #50\n\t" "vsri.u64 d8, d6, #14\n\t" "vshl.u64 d9, d2, #36\n\t" @@ -8804,7 +8661,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d5, d1\n\t" "vadd.i64 d1, d10\n\t" /* Round 7 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d5, #50\n\t" "vsri.u64 d8, d5, #14\n\t" "vshl.u64 d9, d1, #36\n\t" @@ -8833,7 +8690,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d4, d0\n\t" "vadd.i64 d0, d10\n\t" /* Round 8 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d4, #50\n\t" "vsri.u64 d8, d4, 
#14\n\t" "vshl.u64 d9, d0, #36\n\t" @@ -8862,7 +8719,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d3, d7\n\t" "vadd.i64 d7, d10\n\t" /* Round 9 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d3, #50\n\t" "vsri.u64 d8, d3, #14\n\t" "vshl.u64 d9, d7, #36\n\t" @@ -8891,7 +8748,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d2, d6\n\t" "vadd.i64 d6, d10\n\t" /* Round 10 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d2, #50\n\t" "vsri.u64 d8, d2, #14\n\t" "vshl.u64 d9, d6, #36\n\t" @@ -8920,7 +8777,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d1, d5\n\t" "vadd.i64 d5, d10\n\t" /* Round 11 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d1, #50\n\t" "vsri.u64 d8, d1, #14\n\t" "vshl.u64 d9, d5, #36\n\t" @@ -8949,7 +8806,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d0, d4\n\t" "vadd.i64 d4, d10\n\t" /* Round 12 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d0, #50\n\t" "vsri.u64 d8, d0, #14\n\t" "vshl.u64 d9, d4, #36\n\t" @@ -8978,7 +8835,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d7, d3\n\t" "vadd.i64 d3, d10\n\t" /* Round 13 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d7, #50\n\t" "vsri.u64 d8, d7, #14\n\t" "vshl.u64 d9, d3, #36\n\t" @@ -9007,7 +8864,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d6, d2\n\t" "vadd.i64 d2, d10\n\t" /* Round 14 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d6, #50\n\t" "vsri.u64 d8, d6, #14\n\t" "vshl.u64 d9, d2, #36\n\t" @@ -9036,7 +8893,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "vadd.i64 d5, d1\n\t" "vadd.i64 d1, d10\n\t" /* Round 15 */ - "vld1.64 {d12}, [r3]!\n\t" + "vld1.64 {d12}, [r3:64]!\n\t" "vshl.u64 d8, d5, #50\n\t" "vsri.u64 d8, d5, #14\n\t" "vshl.u64 d9, d1, #36\n\t" @@ -9085,12 +8942,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "subs %[len], %[len], #0x80\n\t" "bne L_SHA512_transform_neon_len_begin_%=\n\t" : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len) - : [L_SHA512_transform_neon_len_k] "r" (L_SHA512_transform_neon_len_k) + : [L_SHA512_transform_neon_len_k] "g" (L_SHA512_transform_neon_len_k) : "memory", "r3", "r12", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } #endif /* !WOLFSSL_ARMASM_NO_NEON */ #endif /* WOLFSSL_SHA512 */ -#endif /* !__aarch64__ */ +#endif /* !__aarch64__ && !__thumb__ */ #endif /* WOLFSSL_ARMASM */ +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-aes.c b/wolfcrypt/src/port/arm/armv8-aes.c index 07397d1451..e3a0ebaa48 100644 --- a/wolfcrypt/src/port/arm/armv8-aes.c +++ b/wolfcrypt/src/port/arm/armv8-aes.c @@ -4687,7 +4687,7 @@ static void GHASH_FINAL(Aes* aes, byte* s, word32 sSz) if (aes->cOver > 0) { /* Cipher text block incomplete. */ - over = aes->cOver; + over = aes->cOver; } if (over > 0) { /* Zeroize the unused part of the block. 
*/ @@ -6038,7 +6038,7 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) XMEMSET(iv, 0, AES_BLOCK_SIZE); ret = wc_AesSetKey(aes, key, len, iv, AES_ENCRYPTION); - + if (ret == 0) { AES_ECB_encrypt(iv, aes->gcm.H, AES_BLOCK_SIZE, (const unsigned char*)aes->key, aes->rounds); diff --git a/wolfcrypt/src/port/arm/armv8-chacha.c b/wolfcrypt/src/port/arm/armv8-chacha.c index 13afe2c947..94e645049f 100644 --- a/wolfcrypt/src/port/arm/armv8-chacha.c +++ b/wolfcrypt/src/port/arm/armv8-chacha.c @@ -29,7 +29,7 @@ #include -#ifdef WOLFSSL_ARMASM +#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_NO_NEON) #ifdef HAVE_CHACHA #include @@ -2896,4 +2896,4 @@ int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input, } #endif /* HAVE_CHACHA */ -#endif /* WOLFSSL_ARMASM */ +#endif /* WOLFSSL_ARMASM && !WOLFSSL_ARMASM_NO_NEON */ diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.S b/wolfcrypt/src/port/arm/armv8-curve25519.S index dafa2f7575..6978d9d3ef 100644 --- a/wolfcrypt/src/port/arm/armv8-curve25519.S +++ b/wolfcrypt/src/port/arm/armv8-curve25519.S @@ -30,7 +30,9 @@ */ #ifdef WOLFSSL_ARMASM #ifdef __aarch64__ -#ifdef HAVE_CURVE25519 +#ifndef WOLFSSL_ARMASM_INLINE +#if defined(HAVE_CURVE25519) || defined(HAVE_ED25519) +#if !defined(CURVE25519_SMALL) || !defined(ED25519_SMALL) #ifndef __APPLE__ .text .globl fe_init @@ -47,6 +49,7 @@ _fe_init: #ifndef __APPLE__ .size fe_init,.-fe_init #endif /* __APPLE__ */ +#ifdef HAVE_ED25519 #ifndef __APPLE__ .text .globl fe_frombytes @@ -180,16 +183,17 @@ _fe_sub: sbcs x4, x4, x8 sbcs x5, x5, x9 sbcs x6, x6, x10 - mov x12, #-19 csetm x11, cc + mov x12, #-19 # Mask the modulus - and x12, x11, x12 - and x13, x11, #0x7fffffffffffffff + extr x11, x11, x6, #63 + mul x12, x11, x12 # Add modulus (if underflow) - adds x3, x3, x12 - adcs x4, x4, x11 - adcs x5, x5, x11 - adc x6, x6, x13 + subs x3, x3, x12 + sbcs x4, x4, xzr + and x6, x6, #0x7fffffffffffffff + sbcs x5, x5, xzr + sbc x6, x6, xzr stp x3, x4, [x0] stp x5, x6, [x0, #16] ret @@ -216,17 +220,18 @@ _fe_add: adds x3, x3, x7 adcs x4, x4, x8 adcs x5, x5, x9 - adc x6, x6, x10 - mov x12, #-19 - asr x11, x6, #63 + adcs x6, x6, x10 + cset x11, cs + mov x12, #19 # Mask the modulus - and x12, x11, x12 - and x13, x11, #0x7fffffffffffffff + extr x11, x11, x6, #63 + mul x12, x11, x12 # Sub modulus (if overflow) - subs x3, x3, x12 - sbcs x4, x4, x11 - sbcs x5, x5, x11 - sbc x6, x6, x13 + adds x3, x3, x12 + adcs x4, x4, xzr + and x6, x6, #0x7fffffffffffffff + adcs x5, x5, xzr + adc x6, x6, xzr stp x3, x4, [x0] stp x5, x6, [x0, #16] ret @@ -553,6 +558,7 @@ _fe_cmov_table: #ifndef __APPLE__ .size fe_cmov_table,.-fe_cmov_table #endif /* __APPLE__ */ +#endif /* HAVE_ED25519 */ #ifndef __APPLE__ .text .globl fe_mul @@ -576,136 +582,122 @@ _fe_mul: ldp x16, x17, [x1, #16] ldp x19, x20, [x2] ldp x21, x22, [x2, #16] - # A[0] * B[0] - mul x6, x14, x19 + # A[0] * B[0] umulh x7, x14, x19 - # A[0] * B[1] - mul x3, x14, x20 - umulh x8, x14, x20 - adds x7, x7, x3 - adc x8, x8, xzr - # A[1] * B[0] + mul x6, x14, x19 + # A[2] * B[0] + umulh x9, x16, x19 + mul x8, x16, x19 + # A[1] * B[0] mul x3, x15, x19 + adds x7, x7, x3 umulh x4, x15, x19 + adcs x8, x8, x4 + adc x9, x9, xzr + # A[1] * B[3] + umulh x11, x15, x22 + mul x10, x15, x22 + # A[0] * B[1] + mul x3, x14, x20 adds x7, x7, x3 + umulh x4, x14, x20 adcs x8, x8, x4 - adc x9, xzr, xzr - # A[0] * B[2] - mul x3, x14, x21 - umulh x4, x14, x21 - adds x8, x8, x3 - adc x9, x9, x4 - # A[1] * B[1] - mul x3, x15, x20 - umulh x4, x15, x20 - adds x8, x8, x3 - adcs x9, x9, x4 - adc 
x10, xzr, xzr - # A[2] * B[0] - mul x3, x16, x19 - umulh x4, x16, x19 - adds x8, x8, x3 - adcs x9, x9, x4 - adc x10, x10, xzr - # A[0] * B[3] - mul x3, x14, x22 - umulh x4, x14, x22 - adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, xzr, xzr - # A[1] * B[2] - mul x3, x15, x21 - umulh x4, x15, x21 - adds x9, x9, x3 - adcs x10, x10, x4 - adc x11, x11, xzr - # A[2] * B[1] + # A[2] * B[1] mul x3, x16, x20 + adcs x9, x9, x3 umulh x4, x16, x20 - adds x9, x9, x3 adcs x10, x10, x4 adc x11, x11, xzr - # A[3] * B[0] - mul x3, x17, x19 - umulh x4, x17, x19 + # A[1] * B[2] + mul x3, x15, x21 adds x9, x9, x3 + umulh x4, x15, x21 adcs x10, x10, x4 - adc x11, x11, xzr - # A[1] * B[3] - mul x3, x15, x22 - umulh x4, x15, x22 - adds x10, x10, x3 - adcs x11, x11, x4 + adcs x11, x11, xzr adc x12, xzr, xzr - # A[2] * B[2] - mul x3, x16, x21 - umulh x4, x16, x21 - adds x10, x10, x3 - adcs x11, x11, x4 + # A[0] * B[2] + mul x3, x14, x21 + adds x8, x8, x3 + umulh x4, x14, x21 + adcs x9, x9, x4 + adcs x10, x10, xzr + adcs x11, x11, xzr adc x12, x12, xzr - # A[3] * B[1] + # A[1] * B[1] + mul x3, x15, x20 + adds x8, x8, x3 + umulh x4, x15, x20 + adcs x9, x9, x4 + # A[3] * B[1] mul x3, x17, x20 + adcs x10, x10, x3 umulh x4, x17, x20 - adds x10, x10, x3 adcs x11, x11, x4 adc x12, x12, xzr - # A[2] * B[3] + # A[2] * B[2] + mul x3, x16, x21 + adds x10, x10, x3 + umulh x4, x16, x21 + adcs x11, x11, x4 + # A[3] * B[3] + mul x3, x17, x22 + adcs x12, x12, x3 + umulh x13, x17, x22 + adc x13, x13, xzr + # A[0] * B[3] + mul x3, x14, x22 + adds x9, x9, x3 + umulh x4, x14, x22 + adcs x10, x10, x4 + # A[2] * B[3] mul x3, x16, x22 + adcs x11, x11, x3 umulh x4, x16, x22 - adds x11, x11, x3 adcs x12, x12, x4 - adc x13, xzr, xzr - # A[3] * B[2] + adc x13, x13, xzr + # A[3] * B[0] + mul x3, x17, x19 + adds x9, x9, x3 + umulh x4, x17, x19 + adcs x10, x10, x4 + # A[3] * B[2] mul x3, x17, x21 + adcs x11, x11, x3 umulh x4, x17, x21 - adds x11, x11, x3 adcs x12, x12, x4 adc x13, x13, xzr - # A[3] * B[3] - mul x3, x17, x22 - umulh x4, x17, x22 - adds x12, x12, x3 - adc x13, x13, x4 # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x13, x13, x12, #63 - extr x12, x12, x11, #63 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - and x9, x9, #0x7fffffffffffffff - # Multiply top half by 19 + mov x3, #38 + mul x4, x3, x13 + adds x9, x9, x4 + umulh x5, x3, x13 + adc x5, x5, xzr mov x3, #19 + extr x5, x5, x9, #63 + mul x5, x5, x3 + and x9, x9, #0x7fffffffffffffff + mov x3, #38 mul x4, x3, x10 - umulh x10, x3, x10 adds x6, x6, x4 + umulh x10, x3, x10 mul x4, x3, x11 - umulh x11, x3, x11 adcs x7, x7, x4 + umulh x11, x3, x11 mul x4, x3, x12 - umulh x12, x3, x12 adcs x8, x8, x4 - mul x4, x3, x13 - umulh x5, x3, x13 - adcs x9, x9, x4 - adc x5, x5, xzr - # Add remaining product results in - adds x7, x7, x10 - adcs x8, x8, x11 - adcs x9, x9, x12 - adc x5, x5, xzr - # Overflow - extr x5, x5, x9, #63 - mul x5, x5, x3 - and x9, x9, #0x7fffffffffffffff - adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr + umulh x12, x3, x12 adc x9, x9, xzr - # Reduce if top bit set - and x5, x3, x9, asr 63 - and x9, x9, #0x7fffffffffffffff + # Add high product results in adds x6, x6, x5 + adcs x7, x7, x10 + adcs x8, x8, x11 + adc x9, x9, x12 + # Reduce if top bit set + mov x3, #19 + and x4, x3, x9, asr 63 + adds x6, x6, x4 adcs x7, x7, xzr + and x9, x9, #0x7fffffffffffffff adcs x8, x8, xzr adc x9, x9, xzr # Store @@ -736,33 +728,30 @@ _fe_sq: ldp x13, x14, [x1] ldp x15, x16, [x1, #16] # A[0] * A[1] - mul x6, x13, x14 umulh x7, x13, x14 + mul x6, x13, 
x14 + # A[0] * A[3] + umulh x9, x13, x16 + mul x8, x13, x16 # A[0] * A[2] mul x2, x13, x15 - umulh x8, x13, x15 adds x7, x7, x2 - adc x8, x8, xzr - # A[0] * A[3] - mul x2, x13, x16 - umulh x9, x13, x16 - adds x8, x8, x2 - adc x9, x9, xzr + umulh x3, x13, x15 + adcs x8, x8, x3 + # A[1] * A[3] + mul x2, x14, x16 + adcs x9, x9, x2 + umulh x10, x14, x16 + adc x10, x10, xzr # A[1] * A[2] mul x2, x14, x15 - umulh x3, x14, x15 adds x8, x8, x2 + umulh x3, x14, x15 adcs x9, x9, x3 - adc x10, xzr, xzr - # A[1] * A[3] - mul x2, x14, x16 - umulh x3, x14, x16 - adds x9, x9, x2 - adc x10, x10, x3 # A[2] * A[3] mul x2, x15, x16 + adcs x10, x10, x2 umulh x11, x15, x16 - adds x10, x10, x2 adc x11, x11, xzr # Double adds x6, x6, x6 @@ -773,66 +762,56 @@ _fe_sq: adcs x11, x11, x11 adc x12, xzr, xzr # A[0] * A[0] + umulh x3, x13, x13 mul x5, x13, x13 - umulh x4, x13, x13 # A[1] * A[1] mul x2, x14, x14 + adds x6, x6, x3 umulh x3, x14, x14 - adds x6, x6, x4 adcs x7, x7, x2 - adc x4, x3, xzr # A[2] * A[2] mul x2, x15, x15 + adcs x8, x8, x3 umulh x3, x15, x15 - adds x8, x8, x4 adcs x9, x9, x2 - adc x4, x3, xzr # A[3] * A[3] mul x2, x16, x16 + adcs x10, x10, x3 umulh x3, x16, x16 - adds x10, x10, x4 adcs x11, x11, x2 adc x12, x12, x3 # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x12, x12, x11, #63 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - and x8, x8, #0x7fffffffffffffff - # Multiply top half by 19 + mov x2, #38 + mul x3, x2, x12 + adds x8, x8, x3 + umulh x4, x2, x12 + adc x4, x4, xzr mov x2, #19 + extr x4, x4, x8, #63 + mul x4, x4, x2 + and x8, x8, #0x7fffffffffffffff + mov x2, #38 mul x3, x2, x9 - umulh x9, x2, x9 adds x5, x5, x3 + umulh x9, x2, x9 mul x3, x2, x10 - umulh x10, x2, x10 adcs x6, x6, x3 + umulh x10, x2, x10 mul x3, x2, x11 - umulh x11, x2, x11 adcs x7, x7, x3 - mul x3, x2, x12 - umulh x4, x2, x12 - adcs x8, x8, x3 - adc x4, x4, xzr - # Add remaining product results in - adds x6, x6, x9 - adcs x7, x7, x10 - adcs x8, x8, x11 - adc x4, x4, xzr - # Overflow - extr x4, x4, x8, #63 - mul x4, x4, x2 - and x8, x8, #0x7fffffffffffffff - adds x5, x5, x4 - adcs x6, x6, xzr - adcs x7, x7, xzr + umulh x11, x2, x11 adc x8, x8, xzr - # Reduce if top bit set - and x4, x2, x8, asr 63 - and x8, x8, #0x7fffffffffffffff + # Add high product results in adds x5, x5, x4 + adcs x6, x6, x9 + adcs x7, x7, x10 + adc x8, x8, x11 + # Reduce if top bit set + mov x2, #19 + and x3, x2, x8, asr 63 + adds x5, x5, x3 adcs x6, x6, xzr + and x8, x8, #0x7fffffffffffffff adcs x7, x7, xzr adc x8, x8, xzr # Store @@ -856,6 +835,7 @@ _fe_invert: #endif /* __APPLE__ */ stp x29, x30, [sp, #-176]! 
add x29, sp, #0 + str x17, [x29, #160] str x20, [x29, #168] # Invert str x0, [x29, #144] @@ -920,61 +900,197 @@ _fe_invert: #else bl _fe_mul #endif /* __APPLE__ */ - add x0, x29, #0x50 + # Loop: 5 times + mov x20, #5 + ldp x6, x7, [x29, #48] + ldp x8, x9, [x29, #64] +L_fe_invert1: + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 + subs x20, x20, #1 + bne L_fe_invert1 + # Store + stp x6, x7, [x29, #80] + stp x8, x9, [x29, #96] #ifndef NDEBUG - add x1, x29, #48 + add x0, x29, #48 #endif /* !NDEBUG */ + add x1, x29, #0x50 + add x2, x29, #48 #ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - mov x20, #3 -#ifndef NDEBUG - add x0, x29, #0x50 -#endif /* !NDEBUG */ - add x1, x29, #0x50 -L_fe_invert1: -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - subs x20, x20, #1 - bcs L_fe_invert1 - add x0, x29, #48 -#ifndef NDEBUG - add x1, x29, #0x50 -#endif /* !NDEBUG */ - add x2, x29, #48 -#ifndef __APPLE__ - bl fe_mul + bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ - add x0, x29, #0x50 - add x1, x29, #48 -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - mov x20, #8 -#ifndef NDEBUG - add x0, x29, #0x50 -#endif /* !NDEBUG */ - add x1, x29, #0x50 + # Loop: 10 times + mov x20, #10 + ldp x6, x7, [x29, #48] + ldp x8, x9, [x29, #64] L_fe_invert2: -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + 
# A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 subs x20, x20, #1 - bcs L_fe_invert2 -#ifndef NDEBUG + bne L_fe_invert2 + # Store + stp x6, x7, [x29, #80] + stp x8, x9, [x29, #96] add x0, x29, #0x50 -#endif /* !NDEBUG */ #ifndef NDEBUG add x1, x29, #0x50 #endif /* !NDEBUG */ @@ -984,139 +1100,295 @@ L_fe_invert2: #else bl _fe_mul #endif /* __APPLE__ */ - add x0, x29, #0x70 -#ifndef NDEBUG - add x1, x29, #0x50 -#endif /* !NDEBUG */ -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - mov x20, #18 -#ifndef NDEBUG - add x0, x29, #0x70 -#endif /* !NDEBUG */ - add x1, x29, #0x70 + # Loop: 20 times + mov x20, #20 + ldp x6, x7, [x29, #80] + ldp x8, x9, [x29, #96] L_fe_invert3: -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 subs x20, x20, #1 - bcs L_fe_invert3 - add x0, x29, #0x50 + bne L_fe_invert3 + # Store + stp x6, x7, [x29, #112] + stp x8, x9, [x29, #128] #ifndef NDEBUG - add x1, x29, #0x70 + add x0, x29, #0x50 #endif /* !NDEBUG */ + add x1, x29, #0x70 add x2, x29, #0x50 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ - mov x20, #9 -#ifndef NDEBUG - add x0, x29, #0x50 -#endif /* !NDEBUG */ - add x1, x29, #0x50 + # Loop: 10 times + mov x20, #10 + ldp x6, x7, [x29, #80] + ldp x8, x9, [x29, #96] 
L_fe_invert4: -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - subs x20, x20, #1 - bcs L_fe_invert4 - add x0, x29, #48 -#ifndef NDEBUG - add x1, x29, #0x50 -#endif /* !NDEBUG */ - add x2, x29, #48 -#ifndef __APPLE__ - bl fe_mul -#else - bl _fe_mul -#endif /* __APPLE__ */ - add x0, x29, #0x50 - add x1, x29, #48 -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - mov x20, #48 -#ifndef NDEBUG - add x0, x29, #0x50 -#endif /* !NDEBUG */ - add x1, x29, #0x50 -L_fe_invert5: -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - subs x20, x20, #1 - bcs L_fe_invert5 -#ifndef NDEBUG - add x0, x29, #0x50 -#endif /* !NDEBUG */ -#ifndef NDEBUG - add x1, x29, #0x50 -#endif /* !NDEBUG */ - add x2, x29, #48 -#ifndef __APPLE__ - bl fe_mul -#else - bl _fe_mul -#endif /* __APPLE__ */ - add x0, x29, #0x70 -#ifndef NDEBUG - add x1, x29, #0x50 -#endif /* !NDEBUG */ -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - mov x20, #0x62 -#ifndef NDEBUG - add x0, x29, #0x70 -#endif /* !NDEBUG */ - add x1, x29, #0x70 -L_fe_invert6: -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 subs x20, x20, #1 - bcs L_fe_invert6 - add x0, x29, #0x50 -#ifndef NDEBUG - add x1, x29, #0x70 -#endif /* !NDEBUG */ - add x2, x29, #0x50 + bne L_fe_invert4 + # Store + stp x6, x7, [x29, #80] + stp x8, x9, [x29, #96] + add x0, x29, #48 + add x1, x29, #0x50 + add x2, x29, #48 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ - mov x20, #49 -#ifndef NDEBUG - add x0, x29, #0x50 -#endif /* !NDEBUG */ - add x1, x29, #0x50 -L_fe_invert7: -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ + # Loop: 50 times + mov x20, #50 + ldp x6, x7, [x29, #48] + ldp x8, x9, [x29, #64] +L_fe_invert5: + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, 
x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 subs x20, x20, #1 - bcs L_fe_invert7 - add x0, x29, #48 + bne L_fe_invert5 + # Store + stp x6, x7, [x29, #80] + stp x8, x9, [x29, #96] + add x0, x29, #0x50 #ifndef NDEBUG add x1, x29, #0x50 #endif /* !NDEBUG */ @@ -1126,47 +1398,321 @@ L_fe_invert7: #else bl _fe_mul #endif /* __APPLE__ */ - mov x20, #4 -#ifndef NDEBUG - add x0, x29, #48 -#endif /* !NDEBUG */ - add x1, x29, #48 -L_fe_invert8: -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ + # Loop: 100 times + mov x20, #0x64 + ldp x6, x7, [x29, #80] + ldp x8, x9, [x29, #96] +L_fe_invert6: + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 subs x20, x20, #1 - bcs L_fe_invert8 - ldr x0, [x29, #144] + bne L_fe_invert6 + # Store + stp x6, x7, [x29, #112] + stp x8, x9, [x29, #128] #ifndef NDEBUG - add x1, x29, #48 + add x0, x29, #0x50 #endif /* 
!NDEBUG */ - add x2, x29, #16 + add x1, x29, #0x70 + add x2, x29, #0x50 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ - ldr x20, [x29, #168] - ldp x29, x30, [sp], #0xb0 - ret -#ifndef __APPLE__ - .size fe_invert,.-fe_invert -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl curve25519 -.type curve25519,@function -.align 2 -curve25519: -#else -.section __TEXT,__text -.globl _curve25519 -.p2align 2 -_curve25519: -#endif /* __APPLE__ */ + # Loop: 50 times + mov x20, #50 + ldp x6, x7, [x29, #80] + ldp x8, x9, [x29, #96] +L_fe_invert7: + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 + subs x20, x20, #1 + bne L_fe_invert7 + # Store + stp x6, x7, [x29, #80] + stp x8, x9, [x29, #96] + add x0, x29, #48 + add x1, x29, #0x50 + add x2, x29, #48 +#ifndef __APPLE__ + bl fe_mul +#else + bl _fe_mul +#endif /* __APPLE__ */ + # Loop: 5 times + mov x20, #5 + ldp x6, x7, [x29, #48] + ldp x8, x9, [x29, #64] +L_fe_invert8: + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + 
and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 + subs x20, x20, #1 + bne L_fe_invert8 + # Store + stp x6, x7, [x29, #48] + stp x8, x9, [x29, #64] + ldr x0, [x29, #144] + add x1, x29, #48 + add x2, x29, #16 +#ifndef __APPLE__ + bl fe_mul +#else + bl _fe_mul +#endif /* __APPLE__ */ + ldr x17, [x29, #160] + ldr x20, [x29, #168] + ldp x29, x30, [sp], #0xb0 + ret +#ifndef __APPLE__ + .size fe_invert,.-fe_invert +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl curve25519 +.type curve25519,@function +.align 2 +curve25519: +#else +.section __TEXT,__text +.globl _curve25519 +.p2align 2 +_curve25519: +#endif /* __APPLE__ */ stp x29, x30, [sp, #-288]! add x29, sp, #0 str x17, [x29, #200] @@ -1179,422 +1725,364 @@ _curve25519: mov x23, xzr str x0, [x29, #176] str x2, [x29, #184] - # Copy ldp x6, x7, [x2] ldp x8, x9, [x2, #16] - stp x6, x7, [x29, #80] - stp x8, x9, [x29, #96] - # Set one - mov x2, #1 - stp x2, xzr, [x0] - stp xzr, xzr, [x0, #16] + mov x10, #1 + mov x11, xzr + mov x12, xzr + mov x13, xzr + stp x10, x11, [x0] + stp x12, x13, [x0, #16] # Set zero stp xzr, xzr, [x29, #16] stp xzr, xzr, [x29, #32] - # Set one - mov x2, #1 - stp x2, xzr, [x29, #48] - stp xzr, xzr, [x29, #64] - mov x25, #62 - mov x24, #24 -L_curve25519_words: + mov x24, #0xfe L_curve25519_bits: - ldr x2, [x1, x24] - lsr x2, x2, x25 - and x2, x2, #1 - eor x23, x23, x2 + lsr x3, x24, #6 + and x4, x24, #63 + ldr x5, [x1, x3, LSL 3] + lsr x5, x5, x4 + eor x23, x23, x5 + # Conditional Swap + subs xzr, xzr, x23, lsl 63 + ldp x25, x26, [x29, #16] + ldp x27, x28, [x29, #32] + csel x19, x25, x10, ne + csel x25, x10, x25, ne + csel x20, x26, x11, ne + csel x26, x11, x26, ne + csel x21, x27, x12, ne + csel x27, x12, x27, ne + csel x22, x28, x13, ne + csel x28, x13, x28, ne # Conditional Swap - cmp x23, #1 + subs xzr, xzr, x23, lsl 63 ldp x10, x11, [x0] ldp x12, x13, [x0, #16] - ldp x6, x7, [x29, #80] - ldp x8, x9, [x29, #96] - csel x14, x10, x6, eq - csel x10, x6, x10, eq - csel x15, x11, x7, eq - csel x11, x7, x11, eq - csel x16, x12, x8, eq - csel x12, x8, x12, eq - csel x17, x13, x9, eq - csel x13, x9, x13, eq - # Conditional Swap - cmp x23, #1 - ldp x19, x20, [x29, #16] - ldp x21, x22, [x29, #32] - ldp x6, x7, [x29, #48] - ldp x8, x9, [x29, #64] - csel x5, x19, x6, eq - csel x19, x6, x19, eq - csel x26, x20, x7, eq - csel x20, x7, x20, eq - csel x27, x21, x8, eq - csel x21, x8, x21, eq - csel x28, x22, x9, eq - csel x22, x9, x22, eq - mov x23, x2 + csel x14, x10, x6, ne + csel x10, x6, x10, ne + csel x15, x11, x7, ne + csel x11, x7, x11, ne + csel x16, x12, x8, ne + csel x12, x8, x12, ne + csel x17, x13, x9, ne + csel x13, x9, x13, ne + mov x23, x5 # Add - adds x6, x10, x19 - adcs x7, x11, x20 - adcs x8, x12, x21 - adc x9, x13, x22 - mov x3, #-19 - asr x2, x9, #63 - # Mask the modulus - and x3, x2, x3 - and x4, x2, #0x7fffffffffffffff + adds x6, x10, x25 + adcs x7, x11, x26 + adcs x8, x12, x27 + adcs x9, x13, x28 + cset x5, cs + mov x3, #19 + extr x5, x5, x9, #63 + mul x3, x5, x3 # Sub modulus (if overflow) - subs x6, x6, x3 - sbcs x7, x7, x2 - sbcs x8, x8, x2 - sbc x9, x9, x4 + adds x6, x6, x3 + adcs x7, x7, xzr + and x9, x9, #0x7fffffffffffffff + adcs x8, x8, xzr + adc x9, x9, xzr # Sub - subs x19, x10, x19 - sbcs x20, 
x11, x20 - sbcs x21, x12, x21 - sbcs x22, x13, x22 + subs x25, x10, x25 + sbcs x26, x11, x26 + sbcs x27, x12, x27 + sbcs x28, x13, x28 + csetm x5, cc mov x3, #-19 - csetm x2, cc - # Mask the modulus - and x3, x2, x3 - and x4, x2, #0x7fffffffffffffff + extr x5, x5, x28, #63 + mul x3, x5, x3 # Add modulus (if underflow) - adds x19, x19, x3 - adcs x20, x20, x2 - adcs x21, x21, x2 - adc x22, x22, x4 - stp x19, x20, [x29, #144] - stp x21, x22, [x29, #160] + subs x25, x25, x3 + sbcs x26, x26, xzr + and x28, x28, #0x7fffffffffffffff + sbcs x27, x27, xzr + sbc x28, x28, xzr + stp x25, x26, [x29, #80] + stp x27, x28, [x29, #96] # Add - adds x10, x14, x5 - adcs x11, x15, x26 - adcs x12, x16, x27 - adc x13, x17, x28 - mov x3, #-19 - asr x2, x13, #63 - # Mask the modulus - and x3, x2, x3 - and x4, x2, #0x7fffffffffffffff + adds x10, x14, x19 + adcs x11, x15, x20 + adcs x12, x16, x21 + adcs x13, x17, x22 + cset x5, cs + mov x3, #19 + extr x5, x5, x13, #63 + mul x3, x5, x3 # Sub modulus (if overflow) - subs x10, x10, x3 - sbcs x11, x11, x2 - sbcs x12, x12, x2 - sbc x13, x13, x4 + adds x10, x10, x3 + adcs x11, x11, xzr + and x13, x13, #0x7fffffffffffffff + adcs x12, x12, xzr + adc x13, x13, xzr # Sub - subs x14, x14, x5 - sbcs x15, x15, x26 - sbcs x16, x16, x27 - sbcs x17, x17, x28 + subs x14, x14, x19 + sbcs x15, x15, x20 + sbcs x16, x16, x21 + sbcs x17, x17, x22 + csetm x5, cc mov x3, #-19 - csetm x2, cc - # Mask the modulus - and x3, x2, x3 - and x4, x2, #0x7fffffffffffffff + extr x5, x5, x17, #63 + mul x3, x5, x3 # Add modulus (if underflow) - adds x14, x14, x3 - adcs x15, x15, x2 - adcs x16, x16, x2 - adc x17, x17, x4 + subs x14, x14, x3 + sbcs x15, x15, xzr + and x17, x17, #0x7fffffffffffffff + sbcs x16, x16, xzr + sbc x17, x17, xzr # Multiply - # A[0] * B[0] - mul x19, x14, x6 + # A[0] * B[0] umulh x20, x14, x6 - # A[0] * B[1] - mul x3, x14, x7 - umulh x21, x14, x7 - adds x20, x20, x3 - adc x21, x21, xzr - # A[1] * B[0] + mul x19, x14, x6 + # A[2] * B[0] + umulh x22, x16, x6 + mul x21, x16, x6 + # A[1] * B[0] mul x3, x15, x6 + adds x20, x20, x3 umulh x4, x15, x6 + adcs x21, x21, x4 + adc x22, x22, xzr + # A[1] * B[3] + umulh x26, x15, x9 + mul x25, x15, x9 + # A[0] * B[1] + mul x3, x14, x7 adds x20, x20, x3 + umulh x4, x14, x7 adcs x21, x21, x4 - adc x22, xzr, xzr - # A[0] * B[2] - mul x3, x14, x8 - umulh x4, x14, x8 - adds x21, x21, x3 - adc x22, x22, x4 - # A[1] * B[1] - mul x3, x15, x7 - umulh x4, x15, x7 - adds x21, x21, x3 - adcs x22, x22, x4 - adc x2, xzr, xzr - # A[2] * B[0] - mul x3, x16, x6 - umulh x4, x16, x6 - adds x21, x21, x3 - adcs x22, x22, x4 - adc x2, x2, xzr - # A[0] * B[3] - mul x3, x14, x9 - umulh x4, x14, x9 - adds x22, x22, x3 - adcs x2, x2, x4 - adc x26, xzr, xzr - # A[1] * B[2] - mul x3, x15, x8 - umulh x4, x15, x8 - adds x22, x22, x3 - adcs x2, x2, x4 - adc x26, x26, xzr - # A[2] * B[1] + # A[2] * B[1] mul x3, x16, x7 + adcs x22, x22, x3 umulh x4, x16, x7 - adds x22, x22, x3 - adcs x2, x2, x4 + adcs x25, x25, x4 adc x26, x26, xzr - # A[3] * B[0] - mul x3, x17, x6 - umulh x4, x17, x6 + # A[1] * B[2] + mul x3, x15, x8 adds x22, x22, x3 - adcs x2, x2, x4 - adc x26, x26, xzr - # A[1] * B[3] - mul x3, x15, x9 - umulh x4, x15, x9 - adds x2, x2, x3 - adcs x26, x26, x4 + umulh x4, x15, x8 + adcs x25, x25, x4 + adcs x26, x26, xzr adc x27, xzr, xzr - # A[2] * B[2] - mul x3, x16, x8 - umulh x4, x16, x8 - adds x2, x2, x3 - adcs x26, x26, x4 + # A[0] * B[2] + mul x3, x14, x8 + adds x21, x21, x3 + umulh x4, x14, x8 + adcs x22, x22, x4 + adcs x25, x25, xzr + adcs x26, x26, xzr adc x27, 
x27, xzr - # A[3] * B[1] + # A[1] * B[1] + mul x3, x15, x7 + adds x21, x21, x3 + umulh x4, x15, x7 + adcs x22, x22, x4 + # A[3] * B[1] mul x3, x17, x7 + adcs x25, x25, x3 umulh x4, x17, x7 - adds x2, x2, x3 adcs x26, x26, x4 adc x27, x27, xzr - # A[2] * B[3] - mul x3, x16, x9 - umulh x4, x16, x9 - adds x26, x26, x3 - adcs x27, x27, x4 - adc x28, xzr, xzr - # A[3] * B[2] - mul x3, x17, x8 - umulh x4, x17, x8 - adds x26, x26, x3 + # A[2] * B[2] + mul x3, x16, x8 + adds x25, x25, x3 + umulh x4, x16, x8 + adcs x26, x26, x4 + # A[3] * B[3] + mul x3, x17, x9 + adcs x27, x27, x3 + umulh x28, x17, x9 + adc x28, x28, xzr + # A[0] * B[3] + mul x3, x14, x9 + adds x22, x22, x3 + umulh x4, x14, x9 + adcs x25, x25, x4 + # A[2] * B[3] + mul x3, x16, x9 + adcs x26, x26, x3 + umulh x4, x16, x9 + adcs x27, x27, x4 + adc x28, x28, xzr + # A[3] * B[0] + mul x3, x17, x6 + adds x22, x22, x3 + umulh x4, x17, x6 + adcs x25, x25, x4 + # A[3] * B[2] + mul x3, x17, x8 + adcs x26, x26, x3 + umulh x4, x17, x8 adcs x27, x27, x4 adc x28, x28, xzr - # A[3] * B[3] - mul x3, x17, x9 - umulh x4, x17, x9 - adds x27, x27, x3 - adc x28, x28, x4 # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x28, x28, x27, #63 - extr x27, x27, x26, #63 - extr x26, x26, x2, #63 - extr x2, x2, x22, #63 - and x22, x22, #0x7fffffffffffffff - # Multiply top half by 19 - mov x3, #19 - mul x4, x3, x2 - umulh x2, x3, x2 - adds x19, x19, x4 - mul x4, x3, x26 - umulh x26, x3, x26 - adcs x20, x20, x4 - mul x4, x3, x27 - umulh x27, x3, x27 - adcs x21, x21, x4 + mov x3, #38 mul x4, x3, x28 + adds x22, x22, x4 umulh x5, x3, x28 - adcs x22, x22, x4 - adc x5, x5, xzr - # Add remaining product results in - adds x20, x20, x2 - adcs x21, x21, x26 - adcs x22, x22, x27 adc x5, x5, xzr - # Overflow + mov x3, #19 extr x5, x5, x22, #63 mul x5, x5, x3 and x22, x22, #0x7fffffffffffffff - adds x19, x19, x5 - adcs x20, x20, xzr - adcs x21, x21, xzr + mov x3, #38 + mul x4, x3, x25 + adds x19, x19, x4 + umulh x25, x3, x25 + mul x4, x3, x26 + adcs x20, x20, x4 + umulh x26, x3, x26 + mul x4, x3, x27 + adcs x21, x21, x4 + umulh x27, x3, x27 adc x22, x22, xzr - # Reduce if top bit set - and x5, x3, x22, asr 63 - and x22, x22, #0x7fffffffffffffff + # Add high product results in adds x19, x19, x5 - adcs x20, x20, xzr - adcs x21, x21, xzr - adc x22, x22, xzr + adcs x20, x20, x25 + adcs x21, x21, x26 + adc x22, x22, x27 # Store - stp x19, x20, [x29, #112] - stp x21, x22, [x29, #128] + stp x19, x20, [x29, #48] + stp x21, x22, [x29, #64] # Multiply - ldp x2, x26, [x29, #144] - ldp x27, x28, [x29, #160] - # A[0] * B[0] - mul x19, x10, x2 - umulh x20, x10, x2 - # A[0] * B[1] - mul x3, x10, x26 - umulh x21, x10, x26 + ldp x25, x26, [x29, #80] + ldp x27, x28, [x29, #96] + # A[0] * B[0] + umulh x20, x10, x25 + mul x19, x10, x25 + # A[2] * B[0] + umulh x22, x12, x25 + mul x21, x12, x25 + # A[1] * B[0] + mul x3, x11, x25 adds x20, x20, x3 - adc x21, x21, xzr - # A[1] * B[0] - mul x3, x11, x2 - umulh x4, x11, x2 + umulh x4, x11, x25 + adcs x21, x21, x4 + adc x22, x22, xzr + # A[1] * B[3] + umulh x15, x11, x28 + mul x14, x11, x28 + # A[0] * B[1] + mul x3, x10, x26 adds x20, x20, x3 + umulh x4, x10, x26 adcs x21, x21, x4 - adc x22, xzr, xzr - # A[0] * B[2] - mul x3, x10, x27 - umulh x4, x10, x27 - adds x21, x21, x3 - adc x22, x22, x4 - # A[1] * B[1] - mul x3, x11, x26 - umulh x4, x11, x26 - adds x21, x21, x3 - adcs x22, x22, x4 - adc x14, xzr, xzr - # A[2] * B[0] - mul x3, x12, x2 - umulh x4, x12, x2 - adds x21, x21, x3 - adcs x22, x22, x4 - adc x14, x14, xzr - # A[0] 
* B[3] - mul x3, x10, x28 - umulh x4, x10, x28 - adds x22, x22, x3 - adcs x14, x14, x4 - adc x15, xzr, xzr - # A[1] * B[2] - mul x3, x11, x27 - umulh x4, x11, x27 - adds x22, x22, x3 - adcs x14, x14, x4 - adc x15, x15, xzr - # A[2] * B[1] + # A[2] * B[1] mul x3, x12, x26 + adcs x22, x22, x3 umulh x4, x12, x26 - adds x22, x22, x3 adcs x14, x14, x4 adc x15, x15, xzr - # A[3] * B[0] - mul x3, x13, x2 - umulh x4, x13, x2 + # A[1] * B[2] + mul x3, x11, x27 adds x22, x22, x3 + umulh x4, x11, x27 adcs x14, x14, x4 - adc x15, x15, xzr - # A[1] * B[3] - mul x3, x11, x28 - umulh x4, x11, x28 - adds x14, x14, x3 - adcs x15, x15, x4 + adcs x15, x15, xzr adc x16, xzr, xzr - # A[2] * B[2] - mul x3, x12, x27 - umulh x4, x12, x27 - adds x14, x14, x3 - adcs x15, x15, x4 + # A[0] * B[2] + mul x3, x10, x27 + adds x21, x21, x3 + umulh x4, x10, x27 + adcs x22, x22, x4 + adcs x14, x14, xzr + adcs x15, x15, xzr adc x16, x16, xzr - # A[3] * B[1] + # A[1] * B[1] + mul x3, x11, x26 + adds x21, x21, x3 + umulh x4, x11, x26 + adcs x22, x22, x4 + # A[3] * B[1] mul x3, x13, x26 + adcs x14, x14, x3 umulh x4, x13, x26 - adds x14, x14, x3 adcs x15, x15, x4 adc x16, x16, xzr - # A[2] * B[3] + # A[2] * B[2] + mul x3, x12, x27 + adds x14, x14, x3 + umulh x4, x12, x27 + adcs x15, x15, x4 + # A[3] * B[3] + mul x3, x13, x28 + adcs x16, x16, x3 + umulh x17, x13, x28 + adc x17, x17, xzr + # A[0] * B[3] + mul x3, x10, x28 + adds x22, x22, x3 + umulh x4, x10, x28 + adcs x14, x14, x4 + # A[2] * B[3] mul x3, x12, x28 + adcs x15, x15, x3 umulh x4, x12, x28 - adds x15, x15, x3 adcs x16, x16, x4 - adc x17, xzr, xzr - # A[3] * B[2] + adc x17, x17, xzr + # A[3] * B[0] + mul x3, x13, x25 + adds x22, x22, x3 + umulh x4, x13, x25 + adcs x14, x14, x4 + # A[3] * B[2] mul x3, x13, x27 + adcs x15, x15, x3 umulh x4, x13, x27 - adds x15, x15, x3 adcs x16, x16, x4 adc x17, x17, xzr - # A[3] * B[3] - mul x3, x13, x28 - umulh x4, x13, x28 - adds x16, x16, x3 - adc x17, x17, x4 # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x17, x17, x16, #63 - extr x16, x16, x15, #63 - extr x15, x15, x14, #63 - extr x14, x14, x22, #63 - and x22, x22, #0x7fffffffffffffff - # Multiply top half by 19 + mov x3, #38 + mul x4, x3, x17 + adds x22, x22, x4 + umulh x5, x3, x17 + adc x5, x5, xzr mov x3, #19 + extr x5, x5, x22, #63 + mul x5, x5, x3 + and x22, x22, #0x7fffffffffffffff + mov x3, #38 mul x4, x3, x14 - umulh x14, x3, x14 adds x19, x19, x4 + umulh x14, x3, x14 mul x4, x3, x15 - umulh x15, x3, x15 adcs x20, x20, x4 + umulh x15, x3, x15 mul x4, x3, x16 - umulh x16, x3, x16 adcs x21, x21, x4 - mul x4, x3, x17 - umulh x5, x3, x17 - adcs x22, x22, x4 - adc x5, x5, xzr - # Add remaining product results in - adds x20, x20, x14 - adcs x21, x21, x15 - adcs x22, x22, x16 - adc x5, x5, xzr - # Overflow - extr x5, x5, x22, #63 - mul x5, x5, x3 - and x22, x22, #0x7fffffffffffffff - adds x19, x19, x5 - adcs x20, x20, xzr - adcs x21, x21, xzr + umulh x16, x3, x16 adc x22, x22, xzr - # Reduce if top bit set - and x5, x3, x22, asr 63 - and x22, x22, #0x7fffffffffffffff + # Add high product results in adds x19, x19, x5 - adcs x20, x20, xzr - adcs x21, x21, xzr - adc x22, x22, xzr - # Store + adcs x20, x20, x14 + adcs x21, x21, x15 + adc x22, x22, x16 # Square # A[0] * A[1] - mul x11, x2, x26 - umulh x12, x2, x26 + umulh x12, x25, x26 + mul x11, x25, x26 + # A[0] * A[3] + umulh x14, x25, x28 + mul x13, x25, x28 # A[0] * A[2] - mul x3, x2, x27 - umulh x13, x2, x27 + mul x3, x25, x27 adds x12, x12, x3 - adc x13, x13, xzr - # A[0] * A[3] - mul x3, x2, x28 - umulh 
x14, x2, x28 - adds x13, x13, x3 - adc x14, x14, xzr + umulh x4, x25, x27 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x26, x28 + adcs x14, x14, x3 + umulh x15, x26, x28 + adc x15, x15, xzr # A[1] * A[2] mul x3, x26, x27 - umulh x4, x26, x27 adds x13, x13, x3 + umulh x4, x26, x27 adcs x14, x14, x4 - adc x15, xzr, xzr - # A[1] * A[3] - mul x3, x26, x28 - umulh x4, x26, x28 - adds x14, x14, x3 - adc x15, x15, x4 # A[2] * A[3] mul x3, x27, x28 + adcs x15, x15, x3 umulh x16, x27, x28 - adds x15, x15, x3 adc x16, x16, xzr # Double adds x11, x11, x11 @@ -1605,304 +2093,241 @@ L_curve25519_bits: adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] - mul x10, x2, x2 - umulh x5, x2, x2 + umulh x4, x25, x25 + mul x10, x25, x25 # A[1] * A[1] mul x3, x26, x26 + adds x11, x11, x4 umulh x4, x26, x26 - adds x11, x11, x5 adcs x12, x12, x3 - adc x5, x4, xzr # A[2] * A[2] mul x3, x27, x27 + adcs x13, x13, x4 umulh x4, x27, x27 - adds x13, x13, x5 adcs x14, x14, x3 - adc x5, x4, xzr # A[3] * A[3] mul x3, x28, x28 + adcs x15, x15, x4 umulh x4, x28, x28 - adds x15, x15, x5 adcs x16, x16, x3 adc x17, x17, x4 # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x17, x17, x16, #63 - extr x16, x16, x15, #63 - extr x15, x15, x14, #63 - extr x14, x14, x13, #63 - and x13, x13, #0x7fffffffffffffff - # Multiply top half by 19 + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 mul x4, x3, x14 - umulh x14, x3, x14 adds x10, x10, x4 + umulh x14, x3, x14 mul x4, x3, x15 - umulh x15, x3, x15 adcs x11, x11, x4 + umulh x15, x3, x15 mul x4, x3, x16 - umulh x16, x3, x16 adcs x12, x12, x4 - mul x4, x3, x17 - umulh x5, x3, x17 - adcs x13, x13, x4 - adc x5, x5, xzr - # Add remaining product results in - adds x11, x11, x14 - adcs x12, x12, x15 - adcs x13, x13, x16 - adc x5, x5, xzr - # Overflow - extr x5, x5, x13, #63 - mul x5, x5, x3 - and x13, x13, #0x7fffffffffffffff - adds x10, x10, x5 - adcs x11, x11, xzr - adcs x12, x12, xzr + umulh x16, x3, x16 adc x13, x13, xzr - # Reduce if top bit set - and x5, x3, x13, asr 63 - and x13, x13, #0x7fffffffffffffff + # Add high product results in adds x10, x10, x5 - adcs x11, x11, xzr - adcs x12, x12, xzr - adc x13, x13, xzr - # Store + adcs x11, x11, x14 + adcs x12, x12, x15 + adc x13, x13, x16 # Square # A[0] * A[1] - mul x15, x6, x7 umulh x16, x6, x7 + mul x15, x6, x7 + # A[0] * A[3] + umulh x25, x6, x9 + mul x17, x6, x9 # A[0] * A[2] mul x3, x6, x8 - umulh x17, x6, x8 adds x16, x16, x3 - adc x17, x17, xzr - # A[0] * A[3] - mul x3, x6, x9 - umulh x2, x6, x9 - adds x17, x17, x3 - adc x2, x2, xzr + umulh x4, x6, x8 + adcs x17, x17, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x25, x25, x3 + umulh x26, x7, x9 + adc x26, x26, xzr # A[1] * A[2] mul x3, x7, x8 - umulh x4, x7, x8 adds x17, x17, x3 - adcs x2, x2, x4 - adc x26, xzr, xzr - # A[1] * A[3] - mul x3, x7, x9 - umulh x4, x7, x9 - adds x2, x2, x3 - adc x26, x26, x4 + umulh x4, x7, x8 + adcs x25, x25, x4 # A[2] * A[3] mul x3, x8, x9 + adcs x26, x26, x3 umulh x27, x8, x9 - adds x26, x26, x3 adc x27, x27, xzr # Double adds x15, x15, x15 adcs x16, x16, x16 adcs x17, x17, x17 - adcs x2, x2, x2 + adcs x25, x25, x25 adcs x26, x26, x26 adcs x27, x27, x27 adc x28, xzr, xzr # A[0] * A[0] + umulh x4, x6, x6 mul x14, x6, x6 - umulh x5, x6, x6 # A[1] * A[1] mul x3, x7, x7 + adds x15, x15, x4 umulh x4, x7, x7 - adds x15, x15, x5 adcs x16, x16, x3 - adc x5, x4, xzr # A[2] * A[2] mul x3, x8, x8 + 
adcs x17, x17, x4 umulh x4, x8, x8 - adds x17, x17, x5 - adcs x2, x2, x3 - adc x5, x4, xzr + adcs x25, x25, x3 # A[3] * A[3] mul x3, x9, x9 + adcs x26, x26, x4 umulh x4, x9, x9 - adds x26, x26, x5 adcs x27, x27, x3 adc x28, x28, x4 # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x28, x28, x27, #63 - extr x27, x27, x26, #63 - extr x26, x26, x2, #63 - extr x2, x2, x17, #63 - and x17, x17, #0x7fffffffffffffff - # Multiply top half by 19 - mov x3, #19 - mul x4, x3, x2 - umulh x2, x3, x2 - adds x14, x14, x4 - mul x4, x3, x26 - umulh x26, x3, x26 - adcs x15, x15, x4 - mul x4, x3, x27 - umulh x27, x3, x27 - adcs x16, x16, x4 + mov x3, #38 mul x4, x3, x28 + adds x17, x17, x4 umulh x5, x3, x28 - adcs x17, x17, x4 - adc x5, x5, xzr - # Add remaining product results in - adds x15, x15, x2 - adcs x16, x16, x26 - adcs x17, x17, x27 adc x5, x5, xzr - # Overflow + mov x3, #19 extr x5, x5, x17, #63 mul x5, x5, x3 and x17, x17, #0x7fffffffffffffff - adds x14, x14, x5 - adcs x15, x15, xzr - adcs x16, x16, xzr + mov x3, #38 + mul x4, x3, x25 + adds x14, x14, x4 + umulh x25, x3, x25 + mul x4, x3, x26 + adcs x15, x15, x4 + umulh x26, x3, x26 + mul x4, x3, x27 + adcs x16, x16, x4 + umulh x27, x3, x27 adc x17, x17, xzr - # Reduce if top bit set - and x5, x3, x17, asr 63 - and x17, x17, #0x7fffffffffffffff + # Add high product results in adds x14, x14, x5 - adcs x15, x15, xzr - adcs x16, x16, xzr - adc x17, x17, xzr - # Store + adcs x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, x27 # Multiply - # A[0] * B[0] - mul x6, x14, x10 + # A[0] * B[0] umulh x7, x14, x10 - # A[0] * B[1] - mul x3, x14, x11 - umulh x8, x14, x11 - adds x7, x7, x3 - adc x8, x8, xzr - # A[1] * B[0] + mul x6, x14, x10 + # A[2] * B[0] + umulh x9, x16, x10 + mul x8, x16, x10 + # A[1] * B[0] mul x3, x15, x10 + adds x7, x7, x3 umulh x4, x15, x10 + adcs x8, x8, x4 + adc x9, x9, xzr + # A[1] * B[3] + umulh x26, x15, x13 + mul x25, x15, x13 + # A[0] * B[1] + mul x3, x14, x11 adds x7, x7, x3 + umulh x4, x14, x11 adcs x8, x8, x4 - adc x9, xzr, xzr - # A[0] * B[2] - mul x3, x14, x12 - umulh x4, x14, x12 - adds x8, x8, x3 - adc x9, x9, x4 - # A[1] * B[1] - mul x3, x15, x11 - umulh x4, x15, x11 - adds x8, x8, x3 - adcs x9, x9, x4 - adc x2, xzr, xzr - # A[2] * B[0] - mul x3, x16, x10 - umulh x4, x16, x10 - adds x8, x8, x3 - adcs x9, x9, x4 - adc x2, x2, xzr - # A[0] * B[3] - mul x3, x14, x13 - umulh x4, x14, x13 - adds x9, x9, x3 - adcs x2, x2, x4 - adc x26, xzr, xzr - # A[1] * B[2] - mul x3, x15, x12 - umulh x4, x15, x12 - adds x9, x9, x3 - adcs x2, x2, x4 - adc x26, x26, xzr - # A[2] * B[1] + # A[2] * B[1] mul x3, x16, x11 + adcs x9, x9, x3 umulh x4, x16, x11 - adds x9, x9, x3 - adcs x2, x2, x4 + adcs x25, x25, x4 adc x26, x26, xzr - # A[3] * B[0] - mul x3, x17, x10 - umulh x4, x17, x10 + # A[1] * B[2] + mul x3, x15, x12 adds x9, x9, x3 - adcs x2, x2, x4 - adc x26, x26, xzr - # A[1] * B[3] - mul x3, x15, x13 - umulh x4, x15, x13 - adds x2, x2, x3 - adcs x26, x26, x4 + umulh x4, x15, x12 + adcs x25, x25, x4 + adcs x26, x26, xzr adc x27, xzr, xzr - # A[2] * B[2] - mul x3, x16, x12 - umulh x4, x16, x12 - adds x2, x2, x3 - adcs x26, x26, x4 + # A[0] * B[2] + mul x3, x14, x12 + adds x8, x8, x3 + umulh x4, x14, x12 + adcs x9, x9, x4 + adcs x25, x25, xzr + adcs x26, x26, xzr adc x27, x27, xzr - # A[3] * B[1] + # A[1] * B[1] + mul x3, x15, x11 + adds x8, x8, x3 + umulh x4, x15, x11 + adcs x9, x9, x4 + # A[3] * B[1] mul x3, x17, x11 + adcs x25, x25, x3 umulh x4, x17, x11 - adds x2, x2, x3 adcs x26, x26, x4 adc x27, x27, xzr - # A[2] * 
B[3] + # A[2] * B[2] + mul x3, x16, x12 + adds x25, x25, x3 + umulh x4, x16, x12 + adcs x26, x26, x4 + # A[3] * B[3] + mul x3, x17, x13 + adcs x27, x27, x3 + umulh x28, x17, x13 + adc x28, x28, xzr + # A[0] * B[3] + mul x3, x14, x13 + adds x9, x9, x3 + umulh x4, x14, x13 + adcs x25, x25, x4 + # A[2] * B[3] mul x3, x16, x13 + adcs x26, x26, x3 umulh x4, x16, x13 - adds x26, x26, x3 adcs x27, x27, x4 - adc x28, xzr, xzr - # A[3] * B[2] + adc x28, x28, xzr + # A[3] * B[0] + mul x3, x17, x10 + adds x9, x9, x3 + umulh x4, x17, x10 + adcs x25, x25, x4 + # A[3] * B[2] mul x3, x17, x12 + adcs x26, x26, x3 umulh x4, x17, x12 - adds x26, x26, x3 adcs x27, x27, x4 adc x28, x28, xzr - # A[3] * B[3] - mul x3, x17, x13 - umulh x4, x17, x13 - adds x27, x27, x3 - adc x28, x28, x4 # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x28, x28, x27, #63 - extr x27, x27, x26, #63 - extr x26, x26, x2, #63 - extr x2, x2, x9, #63 - and x9, x9, #0x7fffffffffffffff - # Multiply top half by 19 - mov x3, #19 - mul x4, x3, x2 - umulh x2, x3, x2 - adds x6, x6, x4 - mul x4, x3, x26 - umulh x26, x3, x26 - adcs x7, x7, x4 - mul x4, x3, x27 - umulh x27, x3, x27 - adcs x8, x8, x4 + mov x3, #38 mul x4, x3, x28 + adds x9, x9, x4 umulh x5, x3, x28 - adcs x9, x9, x4 - adc x5, x5, xzr - # Add remaining product results in - adds x7, x7, x2 - adcs x8, x8, x26 - adcs x9, x9, x27 adc x5, x5, xzr - # Overflow + mov x3, #19 extr x5, x5, x9, #63 mul x5, x5, x3 and x9, x9, #0x7fffffffffffffff - adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr + mov x3, #38 + mul x4, x3, x25 + adds x6, x6, x4 + umulh x25, x3, x25 + mul x4, x3, x26 + adcs x7, x7, x4 + umulh x26, x3, x26 + mul x4, x3, x27 + adcs x8, x8, x4 + umulh x27, x3, x27 adc x9, x9, xzr - # Reduce if top bit set - and x5, x3, x9, asr 63 - and x9, x9, #0x7fffffffffffffff + # Add high product results in adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr + adcs x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, x27 # Store stp x6, x7, [x0] stp x8, x9, [x0, #16] @@ -1911,576 +2336,479 @@ L_curve25519_bits: sbcs x15, x15, x11 sbcs x16, x16, x12 sbcs x17, x17, x13 + csetm x5, cc mov x3, #-19 - csetm x2, cc # Mask the modulus - and x3, x2, x3 - and x4, x2, #0x7fffffffffffffff + extr x5, x5, x17, #63 + mul x3, x5, x3 # Add modulus (if underflow) - adds x14, x14, x3 - adcs x15, x15, x2 - adcs x16, x16, x2 - adc x17, x17, x4 + subs x14, x14, x3 + sbcs x15, x15, xzr + and x17, x17, #0x7fffffffffffffff + sbcs x16, x16, xzr + sbc x17, x17, xzr # Multiply by 121666 mov x5, #0xdb42 movk x5, #1, lsl 16 mul x6, x14, x5 umulh x7, x14, x5 mul x3, x15, x5 - umulh x4, x15, x5 + umulh x8, x15, x5 adds x7, x7, x3 - adc x8, xzr, x4 + adc x8, x8, xzr mul x3, x16, x5 - umulh x4, x16, x5 + umulh x9, x16, x5 adds x8, x8, x3 - adc x9, xzr, x4 + adc x9, x9, xzr mul x3, x17, x5 umulh x4, x17, x5 adds x9, x9, x3 - adc x4, xzr, x4 + adc x4, x4, xzr mov x5, #19 extr x4, x4, x9, #63 mul x4, x4, x5 - and x9, x9, #0x7fffffffffffffff adds x6, x6, x4 adcs x7, x7, xzr + and x9, x9, #0x7fffffffffffffff adcs x8, x8, xzr adc x9, x9, xzr # Add adds x10, x10, x6 adcs x11, x11, x7 adcs x12, x12, x8 - adc x13, x13, x9 - mov x3, #-19 - asr x2, x13, #63 + adcs x13, x13, x9 + cset x5, cs + mov x3, #19 # Mask the modulus - and x3, x2, x3 - and x4, x2, #0x7fffffffffffffff + extr x5, x5, x13, #63 + mul x3, x5, x3 # Sub modulus (if overflow) - subs x10, x10, x3 - sbcs x11, x11, x2 - sbcs x12, x12, x2 - sbc x13, x13, x4 + adds x10, x10, x3 + adcs x11, x11, xzr + and x13, x13, #0x7fffffffffffffff + 
adcs x12, x12, xzr + adc x13, x13, xzr # Multiply - # A[0] * B[0] - mul x6, x14, x10 + # A[0] * B[0] umulh x7, x14, x10 - # A[0] * B[1] - mul x3, x14, x11 - umulh x8, x14, x11 - adds x7, x7, x3 - adc x8, x8, xzr - # A[1] * B[0] + mul x6, x14, x10 + # A[2] * B[0] + umulh x9, x16, x10 + mul x8, x16, x10 + # A[1] * B[0] mul x3, x15, x10 + adds x7, x7, x3 umulh x4, x15, x10 + adcs x8, x8, x4 + adc x9, x9, xzr + # A[1] * B[3] + umulh x26, x15, x13 + mul x25, x15, x13 + # A[0] * B[1] + mul x3, x14, x11 adds x7, x7, x3 + umulh x4, x14, x11 adcs x8, x8, x4 - adc x9, xzr, xzr - # A[0] * B[2] - mul x3, x14, x12 - umulh x4, x14, x12 - adds x8, x8, x3 - adc x9, x9, x4 - # A[1] * B[1] - mul x3, x15, x11 - umulh x4, x15, x11 - adds x8, x8, x3 - adcs x9, x9, x4 - adc x2, xzr, xzr - # A[2] * B[0] - mul x3, x16, x10 - umulh x4, x16, x10 - adds x8, x8, x3 - adcs x9, x9, x4 - adc x2, x2, xzr - # A[0] * B[3] - mul x3, x14, x13 - umulh x4, x14, x13 - adds x9, x9, x3 - adcs x2, x2, x4 - adc x26, xzr, xzr - # A[1] * B[2] - mul x3, x15, x12 - umulh x4, x15, x12 - adds x9, x9, x3 - adcs x2, x2, x4 - adc x26, x26, xzr - # A[2] * B[1] + # A[2] * B[1] mul x3, x16, x11 + adcs x9, x9, x3 umulh x4, x16, x11 - adds x9, x9, x3 - adcs x2, x2, x4 + adcs x25, x25, x4 adc x26, x26, xzr - # A[3] * B[0] - mul x3, x17, x10 - umulh x4, x17, x10 + # A[1] * B[2] + mul x3, x15, x12 adds x9, x9, x3 - adcs x2, x2, x4 - adc x26, x26, xzr - # A[1] * B[3] - mul x3, x15, x13 - umulh x4, x15, x13 - adds x2, x2, x3 - adcs x26, x26, x4 + umulh x4, x15, x12 + adcs x25, x25, x4 + adcs x26, x26, xzr adc x27, xzr, xzr - # A[2] * B[2] - mul x3, x16, x12 - umulh x4, x16, x12 - adds x2, x2, x3 - adcs x26, x26, x4 + # A[0] * B[2] + mul x3, x14, x12 + adds x8, x8, x3 + umulh x4, x14, x12 + adcs x9, x9, x4 + adcs x25, x25, xzr + adcs x26, x26, xzr adc x27, x27, xzr - # A[3] * B[1] + # A[1] * B[1] + mul x3, x15, x11 + adds x8, x8, x3 + umulh x4, x15, x11 + adcs x9, x9, x4 + # A[3] * B[1] mul x3, x17, x11 + adcs x25, x25, x3 umulh x4, x17, x11 - adds x2, x2, x3 adcs x26, x26, x4 adc x27, x27, xzr - # A[2] * B[3] + # A[2] * B[2] + mul x3, x16, x12 + adds x25, x25, x3 + umulh x4, x16, x12 + adcs x26, x26, x4 + # A[3] * B[3] + mul x3, x17, x13 + adcs x27, x27, x3 + umulh x28, x17, x13 + adc x28, x28, xzr + # A[0] * B[3] + mul x3, x14, x13 + adds x9, x9, x3 + umulh x4, x14, x13 + adcs x25, x25, x4 + # A[2] * B[3] mul x3, x16, x13 + adcs x26, x26, x3 umulh x4, x16, x13 - adds x26, x26, x3 adcs x27, x27, x4 - adc x28, xzr, xzr - # A[3] * B[2] + adc x28, x28, xzr + # A[3] * B[0] + mul x3, x17, x10 + adds x9, x9, x3 + umulh x4, x17, x10 + adcs x25, x25, x4 + # A[3] * B[2] mul x3, x17, x12 + adcs x26, x26, x3 umulh x4, x17, x12 - adds x26, x26, x3 adcs x27, x27, x4 adc x28, x28, xzr - # A[3] * B[3] - mul x3, x17, x13 - umulh x4, x17, x13 - adds x27, x27, x3 - adc x28, x28, x4 # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x28, x28, x27, #63 - extr x27, x27, x26, #63 - extr x26, x26, x2, #63 - extr x2, x2, x9, #63 - and x9, x9, #0x7fffffffffffffff - # Multiply top half by 19 - mov x3, #19 - mul x4, x3, x2 - umulh x2, x3, x2 - adds x6, x6, x4 - mul x4, x3, x26 - umulh x26, x3, x26 - adcs x7, x7, x4 - mul x4, x3, x27 - umulh x27, x3, x27 - adcs x8, x8, x4 + mov x3, #38 mul x4, x3, x28 + adds x9, x9, x4 umulh x5, x3, x28 - adcs x9, x9, x4 - adc x5, x5, xzr - # Add remaining product results in - adds x7, x7, x2 - adcs x8, x8, x26 - adcs x9, x9, x27 adc x5, x5, xzr - # Overflow + mov x3, #19 extr x5, x5, x9, #63 mul x5, x5, x3 and x9, x9, 
#0x7fffffffffffffff - adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr + mov x3, #38 + mul x4, x3, x25 + adds x6, x6, x4 + umulh x25, x3, x25 + mul x4, x3, x26 + adcs x7, x7, x4 + umulh x26, x3, x26 + mul x4, x3, x27 + adcs x8, x8, x4 + umulh x27, x3, x27 adc x9, x9, xzr - # Reduce if top bit set - and x5, x3, x9, asr 63 - and x9, x9, #0x7fffffffffffffff + # Add high product results in adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr + adcs x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, x27 # Store stp x6, x7, [x29, #16] stp x8, x9, [x29, #32] # Add - ldp x6, x7, [x29, #112] - ldp x8, x9, [x29, #128] - adds x10, x6, x19 - adcs x11, x7, x20 - adcs x12, x8, x21 - adc x13, x9, x22 - mov x3, #-19 - asr x2, x13, #63 - # Mask the modulus - and x3, x2, x3 - and x4, x2, #0x7fffffffffffffff + ldp x25, x26, [x29, #48] + ldp x27, x28, [x29, #64] + adds x10, x25, x19 + adcs x11, x26, x20 + adcs x12, x27, x21 + adcs x13, x28, x22 + cset x5, cs + mov x3, #19 + extr x5, x5, x13, #63 + mul x3, x5, x3 # Sub modulus (if overflow) - subs x10, x10, x3 - sbcs x11, x11, x2 - sbcs x12, x12, x2 - sbc x13, x13, x4 + adds x10, x10, x3 + adcs x11, x11, xzr + and x13, x13, #0x7fffffffffffffff + adcs x12, x12, xzr + adc x13, x13, xzr # Sub - subs x19, x6, x19 - sbcs x20, x7, x20 - sbcs x21, x8, x21 - sbcs x22, x9, x22 + subs x19, x25, x19 + sbcs x20, x26, x20 + sbcs x21, x27, x21 + sbcs x22, x28, x22 + csetm x5, cc mov x3, #-19 - csetm x2, cc - # Mask the modulus - and x3, x2, x3 - and x4, x2, #0x7fffffffffffffff + extr x5, x5, x22, #63 + mul x3, x5, x3 # Add modulus (if underflow) - adds x19, x19, x3 - adcs x20, x20, x2 - adcs x21, x21, x2 - adc x22, x22, x4 + subs x19, x19, x3 + sbcs x20, x20, xzr + and x22, x22, #0x7fffffffffffffff + sbcs x21, x21, xzr + sbc x22, x22, xzr # Square # A[0] * A[1] - mul x7, x10, x11 umulh x8, x10, x11 + mul x7, x10, x11 + # A[0] * A[3] + umulh x25, x10, x13 + mul x9, x10, x13 # A[0] * A[2] mul x3, x10, x12 - umulh x9, x10, x12 adds x8, x8, x3 - adc x9, x9, xzr - # A[0] * A[3] - mul x3, x10, x13 - umulh x2, x10, x13 - adds x9, x9, x3 - adc x2, x2, xzr + umulh x4, x10, x12 + adcs x9, x9, x4 + # A[1] * A[3] + mul x3, x11, x13 + adcs x25, x25, x3 + umulh x26, x11, x13 + adc x26, x26, xzr # A[1] * A[2] mul x3, x11, x12 - umulh x4, x11, x12 adds x9, x9, x3 - adcs x2, x2, x4 - adc x26, xzr, xzr - # A[1] * A[3] - mul x3, x11, x13 - umulh x4, x11, x13 - adds x2, x2, x3 - adc x26, x26, x4 + umulh x4, x11, x12 + adcs x25, x25, x4 # A[2] * A[3] mul x3, x12, x13 + adcs x26, x26, x3 umulh x27, x12, x13 - adds x26, x26, x3 adc x27, x27, xzr # Double adds x7, x7, x7 adcs x8, x8, x8 adcs x9, x9, x9 - adcs x2, x2, x2 + adcs x25, x25, x25 adcs x26, x26, x26 adcs x27, x27, x27 adc x28, xzr, xzr # A[0] * A[0] + umulh x4, x10, x10 mul x6, x10, x10 - umulh x5, x10, x10 # A[1] * A[1] mul x3, x11, x11 + adds x7, x7, x4 umulh x4, x11, x11 - adds x7, x7, x5 adcs x8, x8, x3 - adc x5, x4, xzr # A[2] * A[2] mul x3, x12, x12 + adcs x9, x9, x4 umulh x4, x12, x12 - adds x9, x9, x5 - adcs x2, x2, x3 - adc x5, x4, xzr + adcs x25, x25, x3 # A[3] * A[3] mul x3, x13, x13 + adcs x26, x26, x4 umulh x4, x13, x13 - adds x26, x26, x5 adcs x27, x27, x3 adc x28, x28, x4 # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x28, x28, x27, #63 - extr x27, x27, x26, #63 - extr x26, x26, x2, #63 - extr x2, x2, x9, #63 - and x9, x9, #0x7fffffffffffffff - # Multiply top half by 19 - mov x3, #19 - mul x4, x3, x2 - umulh x2, x3, x2 - adds x6, x6, x4 - mul x4, x3, x26 - umulh x26, x3, x26 - 
adcs x7, x7, x4 - mul x4, x3, x27 - umulh x27, x3, x27 - adcs x8, x8, x4 + mov x3, #38 mul x4, x3, x28 + adds x9, x9, x4 umulh x5, x3, x28 - adcs x9, x9, x4 - adc x5, x5, xzr - # Add remaining product results in - adds x7, x7, x2 - adcs x8, x8, x26 - adcs x9, x9, x27 adc x5, x5, xzr - # Overflow + mov x3, #19 extr x5, x5, x9, #63 mul x5, x5, x3 and x9, x9, #0x7fffffffffffffff - adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr + mov x3, #38 + mul x4, x3, x25 + adds x6, x6, x4 + umulh x25, x3, x25 + mul x4, x3, x26 + adcs x7, x7, x4 + umulh x26, x3, x26 + mul x4, x3, x27 + adcs x8, x8, x4 + umulh x27, x3, x27 adc x9, x9, xzr - # Reduce if top bit set - and x5, x3, x9, asr 63 - and x9, x9, #0x7fffffffffffffff + # Add high product results in adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr - # Store - stp x6, x7, [x29, #80] - stp x8, x9, [x29, #96] - # Square + adcs x7, x7, x25 + adcs x8, x8, x26 + adc x9, x9, x27 + # Square # A[0] * A[1] - mul x7, x19, x20 - umulh x8, x19, x20 + umulh x16, x19, x20 + mul x15, x19, x20 + # A[0] * A[3] + umulh x25, x19, x22 + mul x17, x19, x22 # A[0] * A[2] mul x3, x19, x21 - umulh x9, x19, x21 - adds x8, x8, x3 - adc x9, x9, xzr - # A[0] * A[3] - mul x3, x19, x22 - umulh x2, x19, x22 - adds x9, x9, x3 - adc x2, x2, xzr + adds x16, x16, x3 + umulh x4, x19, x21 + adcs x17, x17, x4 + # A[1] * A[3] + mul x3, x20, x22 + adcs x25, x25, x3 + umulh x26, x20, x22 + adc x26, x26, xzr # A[1] * A[2] mul x3, x20, x21 + adds x17, x17, x3 umulh x4, x20, x21 - adds x9, x9, x3 - adcs x2, x2, x4 - adc x26, xzr, xzr - # A[1] * A[3] - mul x3, x20, x22 - umulh x4, x20, x22 - adds x2, x2, x3 - adc x26, x26, x4 + adcs x25, x25, x4 # A[2] * A[3] mul x3, x21, x22 + adcs x26, x26, x3 umulh x27, x21, x22 - adds x26, x26, x3 adc x27, x27, xzr # Double - adds x7, x7, x7 - adcs x8, x8, x8 - adcs x9, x9, x9 - adcs x2, x2, x2 + adds x15, x15, x15 + adcs x16, x16, x16 + adcs x17, x17, x17 + adcs x25, x25, x25 adcs x26, x26, x26 adcs x27, x27, x27 adc x28, xzr, xzr # A[0] * A[0] - mul x6, x19, x19 - umulh x5, x19, x19 + umulh x4, x19, x19 + mul x14, x19, x19 # A[1] * A[1] mul x3, x20, x20 + adds x15, x15, x4 umulh x4, x20, x20 - adds x7, x7, x5 - adcs x8, x8, x3 - adc x5, x4, xzr + adcs x16, x16, x3 # A[2] * A[2] mul x3, x21, x21 + adcs x17, x17, x4 umulh x4, x21, x21 - adds x9, x9, x5 - adcs x2, x2, x3 - adc x5, x4, xzr + adcs x25, x25, x3 # A[3] * A[3] mul x3, x22, x22 + adcs x26, x26, x4 umulh x4, x22, x22 - adds x26, x26, x5 adcs x27, x27, x3 adc x28, x28, x4 # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x28, x28, x27, #63 - extr x27, x27, x26, #63 - extr x26, x26, x2, #63 - extr x2, x2, x9, #63 - and x9, x9, #0x7fffffffffffffff - # Multiply top half by 19 + mov x3, #38 + mul x4, x3, x28 + adds x17, x17, x4 + umulh x5, x3, x28 + adc x5, x5, xzr mov x3, #19 - mul x4, x3, x2 - umulh x2, x3, x2 - adds x6, x6, x4 + extr x5, x5, x17, #63 + mul x5, x5, x3 + and x17, x17, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x25 + adds x14, x14, x4 + umulh x25, x3, x25 mul x4, x3, x26 + adcs x15, x15, x4 umulh x26, x3, x26 - adcs x7, x7, x4 mul x4, x3, x27 + adcs x16, x16, x4 umulh x27, x3, x27 - adcs x8, x8, x4 - mul x4, x3, x28 - umulh x5, x3, x28 - adcs x9, x9, x4 - adc x5, x5, xzr - # Add remaining product results in - adds x7, x7, x2 - adcs x8, x8, x26 - adcs x9, x9, x27 - adc x5, x5, xzr - # Overflow - extr x5, x5, x9, #63 - mul x5, x5, x3 - and x9, x9, #0x7fffffffffffffff - adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, 
xzr - # Reduce if top bit set - and x5, x3, x9, asr 63 - and x9, x9, #0x7fffffffffffffff - adds x6, x6, x5 - adcs x7, x7, xzr - adcs x8, x8, xzr - adc x9, x9, xzr - # Store - ldr x2, [x29, #184] + adc x17, x17, xzr + # Add high product results in + adds x14, x14, x5 + adcs x15, x15, x25 + adcs x16, x16, x26 + adc x17, x17, x27 # Multiply - ldp x14, x15, [x2] - ldp x16, x17, [x2, #16] - # A[0] * B[0] - mul x10, x14, x6 - umulh x11, x14, x6 - # A[0] * B[1] - mul x3, x14, x7 - umulh x12, x14, x7 + ldp x19, x20, [x2] + ldp x21, x22, [x2, #16] + # A[0] * B[0] + umulh x11, x19, x14 + mul x10, x19, x14 + # A[2] * B[0] + umulh x13, x21, x14 + mul x12, x21, x14 + # A[1] * B[0] + mul x3, x20, x14 adds x11, x11, x3 - adc x12, x12, xzr - # A[1] * B[0] - mul x3, x15, x6 - umulh x4, x15, x6 + umulh x4, x20, x14 + adcs x12, x12, x4 + adc x13, x13, xzr + # A[1] * B[3] + umulh x26, x20, x17 + mul x25, x20, x17 + # A[0] * B[1] + mul x3, x19, x15 adds x11, x11, x3 + umulh x4, x19, x15 adcs x12, x12, x4 - adc x13, xzr, xzr - # A[0] * B[2] - mul x3, x14, x8 - umulh x4, x14, x8 - adds x12, x12, x3 - adc x13, x13, x4 - # A[1] * B[1] - mul x3, x15, x7 - umulh x4, x15, x7 + # A[2] * B[1] + mul x3, x21, x15 + adcs x13, x13, x3 + umulh x4, x21, x15 + adcs x25, x25, x4 + adc x26, x26, xzr + # A[1] * B[2] + mul x3, x20, x16 + adds x13, x13, x3 + umulh x4, x20, x16 + adcs x25, x25, x4 + adcs x26, x26, xzr + adc x27, xzr, xzr + # A[0] * B[2] + mul x3, x19, x16 adds x12, x12, x3 + umulh x4, x19, x16 adcs x13, x13, x4 - adc x2, xzr, xzr - # A[2] * B[0] - mul x3, x16, x6 - umulh x4, x16, x6 + adcs x25, x25, xzr + adcs x26, x26, xzr + adc x27, x27, xzr + # A[1] * B[1] + mul x3, x20, x15 adds x12, x12, x3 + umulh x4, x20, x15 adcs x13, x13, x4 - adc x2, x2, xzr - # A[0] * B[3] - mul x3, x14, x9 - umulh x4, x14, x9 - adds x13, x13, x3 - adcs x2, x2, x4 - adc x26, xzr, xzr - # A[1] * B[2] - mul x3, x15, x8 - umulh x4, x15, x8 - adds x13, x13, x3 - adcs x2, x2, x4 - adc x26, x26, xzr - # A[2] * B[1] - mul x3, x16, x7 - umulh x4, x16, x7 - adds x13, x13, x3 - adcs x2, x2, x4 - adc x26, x26, xzr - # A[3] * B[0] - mul x3, x17, x6 - umulh x4, x17, x6 - adds x13, x13, x3 - adcs x2, x2, x4 - adc x26, x26, xzr - # A[1] * B[3] - mul x3, x15, x9 - umulh x4, x15, x9 - adds x2, x2, x3 - adcs x26, x26, x4 - adc x27, xzr, xzr - # A[2] * B[2] - mul x3, x16, x8 - umulh x4, x16, x8 - adds x2, x2, x3 + # A[3] * B[1] + mul x3, x22, x15 + adcs x25, x25, x3 + umulh x4, x22, x15 adcs x26, x26, x4 adc x27, x27, xzr - # A[3] * B[1] - mul x3, x17, x7 - umulh x4, x17, x7 - adds x2, x2, x3 + # A[2] * B[2] + mul x3, x21, x16 + adds x25, x25, x3 + umulh x4, x21, x16 adcs x26, x26, x4 - adc x27, x27, xzr - # A[2] * B[3] - mul x3, x16, x9 - umulh x4, x16, x9 - adds x26, x26, x3 + # A[3] * B[3] + mul x3, x22, x17 + adcs x27, x27, x3 + umulh x28, x22, x17 + adc x28, x28, xzr + # A[0] * B[3] + mul x3, x19, x17 + adds x13, x13, x3 + umulh x4, x19, x17 + adcs x25, x25, x4 + # A[2] * B[3] + mul x3, x21, x17 + adcs x26, x26, x3 + umulh x4, x21, x17 adcs x27, x27, x4 - adc x28, xzr, xzr - # A[3] * B[2] - mul x3, x17, x8 - umulh x4, x17, x8 - adds x26, x26, x3 + adc x28, x28, xzr + # A[3] * B[0] + mul x3, x22, x14 + adds x13, x13, x3 + umulh x4, x22, x14 + adcs x25, x25, x4 + # A[3] * B[2] + mul x3, x22, x16 + adcs x26, x26, x3 + umulh x4, x22, x16 adcs x27, x27, x4 adc x28, x28, xzr - # A[3] * B[3] - mul x3, x17, x9 - umulh x4, x17, x9 - adds x27, x27, x3 - adc x28, x28, x4 # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x28, x28, x27, #63 - 
extr x27, x27, x26, #63 - extr x26, x26, x2, #63 - extr x2, x2, x13, #63 - and x13, x13, #0x7fffffffffffffff - # Multiply top half by 19 - mov x3, #19 - mul x4, x3, x2 - umulh x2, x3, x2 - adds x10, x10, x4 - mul x4, x3, x26 - umulh x26, x3, x26 - adcs x11, x11, x4 - mul x4, x3, x27 - umulh x27, x3, x27 - adcs x12, x12, x4 + mov x3, #38 mul x4, x3, x28 + adds x13, x13, x4 umulh x5, x3, x28 - adcs x13, x13, x4 - adc x5, x5, xzr - # Add remaining product results in - adds x11, x11, x2 - adcs x12, x12, x26 - adcs x13, x13, x27 adc x5, x5, xzr - # Overflow + mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff - adds x10, x10, x5 - adcs x11, x11, xzr - adcs x12, x12, xzr + mov x3, #38 + mul x4, x3, x25 + adds x10, x10, x4 + umulh x25, x3, x25 + mul x4, x3, x26 + adcs x11, x11, x4 + umulh x26, x3, x26 + mul x4, x3, x27 + adcs x12, x12, x4 + umulh x27, x3, x27 adc x13, x13, xzr - # Reduce if top bit set - and x5, x3, x13, asr 63 - and x13, x13, #0x7fffffffffffffff + # Add high product results in adds x10, x10, x5 - adcs x11, x11, xzr - adcs x12, x12, xzr - adc x13, x13, xzr - # Store - stp x10, x11, [x29, #48] - stp x12, x13, [x29, #64] - sub x25, x25, #1 - cmp x25, #0 + adcs x11, x11, x25 + adcs x12, x12, x26 + adc x13, x13, x27 + subs x24, x24, #1 bge L_curve25519_bits - mov x25, #63 - sub x24, x24, #8 - cmp x24, #0 - bge L_curve25519_words # Invert add x0, x29, #48 add x1, x29, #16 @@ -2540,61 +2868,197 @@ L_curve25519_bits: #else bl _fe_mul #endif /* __APPLE__ */ - add x0, x29, #0x70 -#ifndef NDEBUG - add x1, x29, #0x50 -#endif /* !NDEBUG */ -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - mov x24, #3 -#ifndef NDEBUG - add x0, x29, #0x70 -#endif /* !NDEBUG */ - add x1, x29, #0x70 + # Loop: 5 times + mov x24, #5 + ldp x6, x7, [x29, #80] + ldp x8, x9, [x29, #96] L_curve25519_inv_1: -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 subs x24, x24, #1 - bcs L_curve25519_inv_1 - add x0, x29, #0x50 + bne L_curve25519_inv_1 + # Store + stp x6, 
x7, [x29, #112] + stp x8, x9, [x29, #128] #ifndef NDEBUG - add x1, x29, #0x70 + add x0, x29, #0x50 #endif /* !NDEBUG */ + add x1, x29, #0x70 add x2, x29, #0x50 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ - add x0, x29, #0x70 - add x1, x29, #0x50 -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - mov x24, #8 -#ifndef NDEBUG - add x0, x29, #0x70 -#endif /* !NDEBUG */ - add x1, x29, #0x70 + # Loop: 10 times + mov x24, #10 + ldp x6, x7, [x29, #80] + ldp x8, x9, [x29, #96] L_curve25519_inv_2: -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - subs x24, x24, #1 - bcs L_curve25519_inv_2 -#ifndef NDEBUG + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 + subs x24, x24, #1 + bne L_curve25519_inv_2 + # Store + stp x6, x7, [x29, #112] + stp x8, x9, [x29, #128] add x0, x29, #0x70 -#endif /* !NDEBUG */ #ifndef NDEBUG add x1, x29, #0x70 #endif /* !NDEBUG */ @@ -2604,307 +3068,722 @@ L_curve25519_inv_2: #else bl _fe_mul #endif /* __APPLE__ */ - add x0, x29, #0x90 -#ifndef NDEBUG - add x1, x29, #0x70 -#endif /* !NDEBUG */ -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - mov x24, #18 -#ifndef NDEBUG - add x0, x29, #0x90 -#endif /* !NDEBUG */ - add x1, x29, #0x90 + # Loop: 20 times + mov x24, #20 + ldp x6, x7, [x29, #112] + ldp x8, x9, [x29, #128] L_curve25519_inv_3: -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * 
A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 subs x24, x24, #1 - bcs L_curve25519_inv_3 - add x0, x29, #0x70 + bne L_curve25519_inv_3 + # Store + stp x6, x7, [x29, #144] + stp x8, x9, [x29, #160] #ifndef NDEBUG - add x1, x29, #0x90 + add x0, x29, #0x70 #endif /* !NDEBUG */ + add x1, x29, #0x90 add x2, x29, #0x70 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ - mov x24, #9 -#ifndef NDEBUG - add x0, x29, #0x70 -#endif /* !NDEBUG */ - add x1, x29, #0x70 + # Loop: 10 times + mov x24, #10 + ldp x6, x7, [x29, #112] + ldp x8, x9, [x29, #128] L_curve25519_inv_4: -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 subs x24, x24, #1 - bcs L_curve25519_inv_4 + bne L_curve25519_inv_4 + # Store + stp x6, x7, [x29, #112] + stp x8, x9, [x29, #128] add x0, x29, #0x50 -#ifndef NDEBUG add x1, x29, #0x70 -#endif /* !NDEBUG */ add x2, x29, #0x50 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ - add x0, x29, #0x70 - add x1, x29, #0x50 -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - mov x24, #48 -#ifndef NDEBUG - add x0, x29, #0x70 -#endif /* !NDEBUG */ - add x1, x29, #0x70 + # Loop: 50 times + mov x24, #50 + ldp x6, x7, [x29, #80] + ldp x8, x9, [x29, #96] 
L_curve25519_inv_5: -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - subs x24, x24, #1 - bcs L_curve25519_inv_5 -#ifndef NDEBUG - add x0, x29, #0x70 -#endif /* !NDEBUG */ -#ifndef NDEBUG - add x1, x29, #0x70 -#endif /* !NDEBUG */ - add x2, x29, #0x50 -#ifndef __APPLE__ - bl fe_mul -#else - bl _fe_mul -#endif /* __APPLE__ */ - add x0, x29, #0x90 -#ifndef NDEBUG - add x1, x29, #0x70 -#endif /* !NDEBUG */ -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - mov x24, #0x62 -#ifndef NDEBUG - add x0, x29, #0x90 -#endif /* !NDEBUG */ - add x1, x29, #0x90 -L_curve25519_inv_6: -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 subs x24, x24, #1 - bcs L_curve25519_inv_6 + bne L_curve25519_inv_5 + # Store + stp x6, x7, [x29, #112] + stp x8, x9, [x29, #128] add x0, x29, #0x70 #ifndef NDEBUG - add x1, x29, #0x90 + add x1, x29, #0x70 #endif /* !NDEBUG */ - add x2, x29, #0x70 + add x2, x29, #0x50 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ - mov x24, #49 + # Loop: 100 times + mov x24, #0x64 + ldp x6, x7, [x29, #112] + ldp x8, x9, [x29, #128] +L_curve25519_inv_6: + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # 
A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 + subs x24, x24, #1 + bne L_curve25519_inv_6 + # Store + stp x6, x7, [x29, #144] + stp x8, x9, [x29, #160] #ifndef NDEBUG add x0, x29, #0x70 #endif /* !NDEBUG */ - add x1, x29, #0x70 -L_curve25519_inv_7: + add x1, x29, #0x90 + add x2, x29, #0x70 #ifndef __APPLE__ - bl fe_sq + bl fe_mul #else - bl _fe_sq + bl _fe_mul #endif /* __APPLE__ */ + # Loop: 50 times + mov x24, #50 + ldp x6, x7, [x29, #112] + ldp x8, x9, [x29, #128] +L_curve25519_inv_7: + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 subs x24, x24, #1 - bcs L_curve25519_inv_7 + bne L_curve25519_inv_7 + # Store + stp x6, x7, [x29, #112] + stp x8, x9, [x29, #128] add x0, x29, #0x50 -#ifndef NDEBUG add x1, x29, #0x70 -#endif /* !NDEBUG */ add x2, x29, #0x50 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ - mov x24, #4 -#ifndef NDEBUG - add x0, x29, #0x50 -#endif /* !NDEBUG */ - add x1, x29, #0x50 + # Loop: 5 times + mov x24, #5 + ldp x6, x7, [x29, #80] + ldp x8, x9, [x29, #96] L_curve25519_inv_8: -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - subs x24, x24, #1 - bcs L_curve25519_inv_8 - add x0, x29, #16 -#ifndef NDEBUG - add x1, x29, #0x50 -#endif /* !NDEBUG */ - add x2, x29, #48 -#ifndef __APPLE__ - bl fe_mul -#else - bl _fe_mul -#endif /* __APPLE__ */ - ldr x0, [x29, #176] - # Multiply - ldp x6, x7, [x0] - ldp x8, x9, [x0, #16] - ldp x10, x11, [x29, #16] - ldp x12, x13, [x29, #32] - # A[0] * B[0] - mul x14, x6, x10 - umulh x15, x6, x10 - # A[0] * 
B[1] - mul x3, x6, x11 - umulh x16, x6, x11 - adds x15, x15, x3 - adc x16, x16, xzr - # A[1] * B[0] - mul x3, x7, x10 + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 + subs x24, x24, #1 + bne L_curve25519_inv_8 + # Store + stp x6, x7, [x29, #80] + stp x8, x9, [x29, #96] + add x0, x29, #16 + add x1, x29, #0x50 + add x2, x29, #48 +#ifndef __APPLE__ + bl fe_mul +#else + bl _fe_mul +#endif /* __APPLE__ */ + ldr x0, [x29, #176] + # Multiply + ldp x6, x7, [x0] + ldp x8, x9, [x0, #16] + ldp x10, x11, [x29, #16] + ldp x12, x13, [x29, #32] + # A[0] * B[0] + umulh x15, x6, x10 + mul x14, x6, x10 + # A[2] * B[0] + umulh x17, x8, x10 + mul x16, x8, x10 + # A[1] * B[0] + mul x3, x7, x10 + adds x15, x15, x3 umulh x4, x7, x10 + adcs x16, x16, x4 + adc x17, x17, xzr + # A[1] * B[3] + umulh x20, x7, x13 + mul x19, x7, x13 + # A[0] * B[1] + mul x3, x6, x11 adds x15, x15, x3 + umulh x4, x6, x11 adcs x16, x16, x4 - adc x17, xzr, xzr - # A[0] * B[2] - mul x3, x6, x12 - umulh x4, x6, x12 - adds x16, x16, x3 - adc x17, x17, x4 - # A[1] * B[1] - mul x3, x7, x11 - umulh x4, x7, x11 - adds x16, x16, x3 - adcs x17, x17, x4 - adc x19, xzr, xzr - # A[2] * B[0] - mul x3, x8, x10 - umulh x4, x8, x10 - adds x16, x16, x3 - adcs x17, x17, x4 - adc x19, x19, xzr - # A[0] * B[3] - mul x3, x6, x13 - umulh x4, x6, x13 - adds x17, x17, x3 - adcs x19, x19, x4 - adc x20, xzr, xzr - # A[1] * B[2] - mul x3, x7, x12 - umulh x4, x7, x12 - adds x17, x17, x3 - adcs x19, x19, x4 - adc x20, x20, xzr - # A[2] * B[1] + # A[2] * B[1] mul x3, x8, x11 + adcs x17, x17, x3 umulh x4, x8, x11 - adds x17, x17, x3 adcs x19, x19, x4 adc x20, x20, xzr - # A[3] * B[0] - mul x3, x9, x10 - umulh x4, x9, x10 + # A[1] * B[2] + mul x3, x7, x12 adds x17, x17, x3 + umulh x4, x7, x12 adcs x19, x19, x4 - adc x20, x20, xzr - # A[1] * B[3] - mul x3, x7, x13 - umulh x4, x7, x13 - adds x19, x19, x3 - adcs x20, x20, x4 + adcs x20, x20, xzr adc x21, xzr, xzr - # A[2] * B[2] - mul x3, x8, x12 - umulh x4, x8, x12 - adds x19, x19, x3 - adcs x20, x20, x4 + # A[0] * B[2] + mul x3, x6, x12 + adds x16, x16, x3 + umulh x4, x6, x12 + adcs x17, 
x17, x4 + adcs x19, x19, xzr + adcs x20, x20, xzr adc x21, x21, xzr - # A[3] * B[1] + # A[1] * B[1] + mul x3, x7, x11 + adds x16, x16, x3 + umulh x4, x7, x11 + adcs x17, x17, x4 + # A[3] * B[1] mul x3, x9, x11 + adcs x19, x19, x3 umulh x4, x9, x11 - adds x19, x19, x3 adcs x20, x20, x4 adc x21, x21, xzr - # A[2] * B[3] + # A[2] * B[2] + mul x3, x8, x12 + adds x19, x19, x3 + umulh x4, x8, x12 + adcs x20, x20, x4 + # A[3] * B[3] + mul x3, x9, x13 + adcs x21, x21, x3 + umulh x22, x9, x13 + adc x22, x22, xzr + # A[0] * B[3] + mul x3, x6, x13 + adds x17, x17, x3 + umulh x4, x6, x13 + adcs x19, x19, x4 + # A[2] * B[3] mul x3, x8, x13 + adcs x20, x20, x3 umulh x4, x8, x13 - adds x20, x20, x3 adcs x21, x21, x4 - adc x22, xzr, xzr - # A[3] * B[2] + adc x22, x22, xzr + # A[3] * B[0] + mul x3, x9, x10 + adds x17, x17, x3 + umulh x4, x9, x10 + adcs x19, x19, x4 + # A[3] * B[2] mul x3, x9, x12 + adcs x20, x20, x3 umulh x4, x9, x12 - adds x20, x20, x3 adcs x21, x21, x4 adc x22, x22, xzr - # A[3] * B[3] - mul x3, x9, x13 - umulh x4, x9, x13 - adds x21, x21, x3 - adc x22, x22, x4 # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x22, x22, x21, #63 - extr x21, x21, x20, #63 - extr x20, x20, x19, #63 - extr x19, x19, x17, #63 - and x17, x17, #0x7fffffffffffffff - # Multiply top half by 19 + mov x3, #38 + mul x4, x3, x22 + adds x17, x17, x4 + umulh x5, x3, x22 + adc x5, x5, xzr mov x3, #19 + extr x5, x5, x17, #63 + mul x5, x5, x3 + and x17, x17, #0x7fffffffffffffff + mov x3, #38 mul x4, x3, x19 - umulh x19, x3, x19 adds x14, x14, x4 + umulh x19, x3, x19 mul x4, x3, x20 - umulh x20, x3, x20 adcs x15, x15, x4 + umulh x20, x3, x20 mul x4, x3, x21 - umulh x21, x3, x21 adcs x16, x16, x4 - mul x4, x3, x22 - umulh x5, x3, x22 - adcs x17, x17, x4 - adc x5, x5, xzr - # Add remaining product results in - adds x15, x15, x19 - adcs x16, x16, x20 - adcs x17, x17, x21 - adc x5, x5, xzr - # Overflow - extr x5, x5, x17, #63 - mul x5, x5, x3 - and x17, x17, #0x7fffffffffffffff - adds x14, x14, x5 - adcs x15, x15, xzr - adcs x16, x16, xzr + umulh x21, x3, x21 adc x17, x17, xzr - # Reduce if top bit set - and x5, x3, x17, asr 63 - and x17, x17, #0x7fffffffffffffff + # Add high product results in adds x14, x14, x5 + adcs x15, x15, x19 + adcs x16, x16, x20 + adc x17, x17, x21 + # Reduce if top bit set + mov x3, #19 + and x4, x3, x17, asr 63 + adds x14, x14, x4 adcs x15, x15, xzr + and x17, x17, #0x7fffffffffffffff adcs x16, x16, xzr adc x17, x17, xzr adds x4, x14, x3 @@ -2934,6 +3813,7 @@ L_curve25519_inv_8: #ifndef __APPLE__ .size curve25519,.-curve25519 #endif /* __APPLE__ */ +#ifdef HAVE_ED25519 #ifndef __APPLE__ .text .globl fe_pow22523 @@ -2948,6 +3828,7 @@ _fe_pow22523: #endif /* __APPLE__ */ stp x29, x30, [sp, #-144]! 
add x29, sp, #0 + str x17, [x29, #128] str x23, [x29, #136] # pow22523 str x0, [x29, #112] @@ -3016,114 +3897,99 @@ _fe_pow22523: #else bl _fe_mul #endif /* __APPLE__ */ - add x0, x29, #48 - add x1, x29, #16 -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - mov x23, #3 -#ifndef NDEBUG - add x0, x29, #48 -#endif /* !NDEBUG */ - add x1, x29, #48 + # Loop: 5 times + mov x23, #5 + ldp x6, x7, [x29, #16] + ldp x8, x9, [x29, #32] L_fe_pow22523_1: -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - subs x23, x23, #1 - bcs L_fe_pow22523_1 - add x0, x29, #16 -#ifndef NDEBUG - add x1, x29, #48 -#endif /* !NDEBUG */ - add x2, x29, #16 -#ifndef __APPLE__ - bl fe_mul -#else - bl _fe_mul -#endif /* __APPLE__ */ - add x0, x29, #48 - add x1, x29, #16 -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - mov x23, #8 -#ifndef NDEBUG - add x0, x29, #48 -#endif /* !NDEBUG */ - add x1, x29, #48 -L_fe_pow22523_2: -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - subs x23, x23, #1 - bcs L_fe_pow22523_2 -#ifndef NDEBUG - add x0, x29, #48 -#endif /* !NDEBUG */ -#ifndef NDEBUG - add x1, x29, #48 -#endif /* !NDEBUG */ - add x2, x29, #16 -#ifndef __APPLE__ - bl fe_mul -#else - bl _fe_mul -#endif /* __APPLE__ */ - add x0, x29, #0x50 -#ifndef NDEBUG - add x1, x29, #48 -#endif /* !NDEBUG */ -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - mov x23, #18 -#ifndef NDEBUG - add x0, x29, #0x50 -#endif /* !NDEBUG */ - add x1, x29, #0x50 -L_fe_pow22523_3: -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 subs x23, x23, #1 - bcs L_fe_pow22523_3 - add x0, x29, #48 -#ifndef NDEBUG - add x1, x29, #0x50 -#endif /* !NDEBUG */ - add x2, x29, #48 -#ifndef __APPLE__ - bl fe_mul -#else - bl _fe_mul -#endif /* __APPLE__ */ - mov x23, #9 + bne L_fe_pow22523_1 + # Store + stp x6, x7, [x29, #48] + stp x8, x9, [x29, #64] #ifndef NDEBUG - add x0, x29, #48 -#endif /* !NDEBUG */ - add x1, x29, #48 -L_fe_pow22523_4: -#ifndef 
__APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - subs x23, x23, #1 - bcs L_fe_pow22523_4 add x0, x29, #16 +#endif /* !NDEBUG */ #ifndef NDEBUG add x1, x29, #48 #endif /* !NDEBUG */ @@ -3133,29 +3999,97 @@ L_fe_pow22523_4: #else bl _fe_mul #endif /* __APPLE__ */ - add x0, x29, #48 - add x1, x29, #16 -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - mov x23, #48 -#ifndef NDEBUG - add x0, x29, #48 -#endif /* !NDEBUG */ - add x1, x29, #48 -L_fe_pow22523_5: -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ + # Loop: 10 times + mov x23, #10 + ldp x6, x7, [x29, #16] + ldp x8, x9, [x29, #32] +L_fe_pow22523_2: + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 subs x23, x23, #1 - bcs L_fe_pow22523_5 -#ifndef NDEBUG + bne L_fe_pow22523_2 + # Store + stp x6, x7, [x29, #48] + stp x8, x9, [x29, #64] add x0, x29, #48 -#endif /* !NDEBUG */ #ifndef NDEBUG add x1, x29, #48 #endif /* !NDEBUG */ @@ -3165,52 +4099,295 @@ L_fe_pow22523_5: #else bl _fe_mul #endif /* __APPLE__ */ - add x0, x29, #0x50 -#ifndef NDEBUG - add x1, x29, #48 -#endif /* !NDEBUG */ -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ - mov x23, #0x62 + # Loop: 20 times + mov x23, #20 + ldp x6, x7, [x29, #48] + ldp x8, x9, [x29, #64] +L_fe_pow22523_3: + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 
+ # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 + subs x23, x23, #1 + bne L_fe_pow22523_3 + # Store + stp x6, x7, [x29, #80] + stp x8, x9, [x29, #96] #ifndef NDEBUG - add x0, x29, #0x50 + add x0, x29, #48 #endif /* !NDEBUG */ add x1, x29, #0x50 -L_fe_pow22523_6: + add x2, x29, #48 #ifndef __APPLE__ - bl fe_sq + bl fe_mul #else - bl _fe_sq + bl _fe_mul #endif /* __APPLE__ */ - subs x23, x23, #1 - bcs L_fe_pow22523_6 - add x0, x29, #48 -#ifndef NDEBUG - add x1, x29, #0x50 -#endif /* !NDEBUG */ - add x2, x29, #48 + # Loop: 10 times + mov x23, #10 + ldp x6, x7, [x29, #48] + ldp x8, x9, [x29, #64] +L_fe_pow22523_4: + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 + subs x23, x23, #1 + bne L_fe_pow22523_4 + # Store + stp x6, x7, [x29, #48] + stp x8, x9, [x29, #64] + add x0, x29, #16 + add x1, x29, #48 + add x2, x29, #16 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ - mov x23, #49 -#ifndef NDEBUG - add x0, x29, #48 -#endif /* !NDEBUG */ - add x1, x29, #48 -L_fe_pow22523_7: -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ + # Loop: 50 times + mov x23, #50 + ldp x6, x7, [x29, #16] + ldp x8, x9, [x29, #32] +L_fe_pow22523_5: + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs 
x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 subs x23, x23, #1 - bcs L_fe_pow22523_7 - add x0, x29, #16 + bne L_fe_pow22523_5 + # Store + stp x6, x7, [x29, #48] + stp x8, x9, [x29, #64] + add x0, x29, #48 #ifndef NDEBUG add x1, x29, #48 #endif /* !NDEBUG */ @@ -3220,2782 +4397,2064 @@ L_fe_pow22523_7: #else bl _fe_mul #endif /* __APPLE__ */ - mov x23, #1 -#ifndef NDEBUG - add x0, x29, #16 -#endif /* !NDEBUG */ - add x1, x29, #16 -L_fe_pow22523_8: -#ifndef __APPLE__ - bl fe_sq -#else - bl _fe_sq -#endif /* __APPLE__ */ + # Loop: 100 times + mov x23, #0x64 + ldp x6, x7, [x29, #48] + ldp x8, x9, [x29, #64] +L_fe_pow22523_6: + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 + # Reduce + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 subs x23, x23, #1 - bcs L_fe_pow22523_8 - ldr x0, [x29, #112] + bne L_fe_pow22523_6 + # Store + stp x6, x7, [x29, #80] + stp x8, x9, [x29, #96] #ifndef NDEBUG - add x1, x29, #16 + add x0, x29, #48 #endif /* 
!NDEBUG */ - ldr x2, [x29, #120] + add x1, x29, #0x50 + add x2, x29, #48 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ - ldr x23, [x29, #136] - ldp x29, x30, [sp], #0x90 - ret -#ifndef __APPLE__ - .size fe_pow22523,.-fe_pow22523 -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_ge_to_p2 -.type fe_ge_to_p2,@function -.align 2 -fe_ge_to_p2: -#else -.section __TEXT,__text -.globl _fe_ge_to_p2 -.p2align 2 -_fe_ge_to_p2: -#endif /* __APPLE__ */ - stp x29, x30, [sp, #-112]! - add x29, sp, #0 - str x17, [x29, #72] - str x19, [x29, #80] - stp x20, x21, [x29, #88] - str x22, [x29, #104] - str x1, [x29, #16] - str x2, [x29, #24] - str x3, [x29, #32] - str x4, [x29, #40] - str x5, [x29, #48] - str x6, [x29, #56] - ldr x1, [x29, #32] - ldr x2, [x29, #56] - # Multiply - ldp x11, x12, [x1] - ldp x13, x14, [x1, #16] - ldp x15, x16, [x2] - ldp x17, x19, [x2, #16] - # A[0] * B[0] - mul x3, x11, x15 - umulh x4, x11, x15 - # A[0] * B[1] - mul x20, x11, x16 - umulh x5, x11, x16 - adds x4, x4, x20 - adc x5, x5, xzr - # A[1] * B[0] - mul x20, x12, x15 - umulh x21, x12, x15 - adds x4, x4, x20 - adcs x5, x5, x21 - adc x6, xzr, xzr - # A[0] * B[2] - mul x20, x11, x17 - umulh x21, x11, x17 - adds x5, x5, x20 - adc x6, x6, x21 - # A[1] * B[1] - mul x20, x12, x16 - umulh x21, x12, x16 - adds x5, x5, x20 - adcs x6, x6, x21 - adc x7, xzr, xzr - # A[2] * B[0] - mul x20, x13, x15 - umulh x21, x13, x15 - adds x5, x5, x20 - adcs x6, x6, x21 - adc x7, x7, xzr - # A[0] * B[3] - mul x20, x11, x19 - umulh x21, x11, x19 - adds x6, x6, x20 - adcs x7, x7, x21 - adc x8, xzr, xzr - # A[1] * B[2] - mul x20, x12, x17 - umulh x21, x12, x17 - adds x6, x6, x20 - adcs x7, x7, x21 - adc x8, x8, xzr - # A[2] * B[1] - mul x20, x13, x16 - umulh x21, x13, x16 - adds x6, x6, x20 - adcs x7, x7, x21 - adc x8, x8, xzr - # A[3] * B[0] - mul x20, x14, x15 - umulh x21, x14, x15 - adds x6, x6, x20 - adcs x7, x7, x21 - adc x8, x8, xzr - # A[1] * B[3] - mul x20, x12, x19 - umulh x21, x12, x19 - adds x7, x7, x20 - adcs x8, x8, x21 - adc x9, xzr, xzr - # A[2] * B[2] - mul x20, x13, x17 - umulh x21, x13, x17 - adds x7, x7, x20 - adcs x8, x8, x21 - adc x9, x9, xzr - # A[3] * B[1] - mul x20, x14, x16 - umulh x21, x14, x16 - adds x7, x7, x20 - adcs x8, x8, x21 - adc x9, x9, xzr - # A[2] * B[3] - mul x20, x13, x19 - umulh x21, x13, x19 - adds x8, x8, x20 - adcs x9, x9, x21 - adc x10, xzr, xzr - # A[3] * B[2] - mul x20, x14, x17 - umulh x21, x14, x17 - adds x8, x8, x20 - adcs x9, x9, x21 - adc x10, x10, xzr - # A[3] * B[3] - mul x20, x14, x19 - umulh x21, x14, x19 - adds x9, x9, x20 - adc x10, x10, x21 - # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - extr x7, x7, x6, #63 - and x6, x6, #0x7fffffffffffffff - # Multiply top half by 19 - mov x20, #19 - mul x21, x20, x7 - umulh x7, x20, x7 - adds x3, x3, x21 - mul x21, x20, x8 - umulh x8, x20, x8 - adcs x4, x4, x21 - mul x21, x20, x9 - umulh x9, x20, x9 - adcs x5, x5, x21 - mul x21, x20, x10 - umulh x22, x20, x10 - adcs x6, x6, x21 - adc x22, x22, xzr - # Add remaining product results in - adds x4, x4, x7 - adcs x5, x5, x8 - adcs x6, x6, x9 - adc x22, x22, xzr - # Overflow - extr x22, x22, x6, #63 - mul x22, x22, x20 - and x6, x6, #0x7fffffffffffffff - adds x3, x3, x22 - adcs x4, x4, xzr - adcs x5, x5, xzr - adc x6, x6, xzr - # Reduce if top bit set - and x22, x20, x6, asr 63 - and x6, x6, #0x7fffffffffffffff - adds x3, x3, x22 - adcs x4, x4, xzr - adcs x5, x5, xzr - adc x6, x6, xzr - # Store 
- stp x3, x4, [x0] - stp x5, x6, [x0, #16] - ldr x0, [x29, #16] - ldr x1, [x29, #40] - ldr x2, [x29, #48] - # Multiply - ldp x11, x12, [x1] - ldp x13, x14, [x1, #16] - ldp x15, x16, [x2] - ldp x17, x19, [x2, #16] - # A[0] * B[0] - mul x3, x11, x15 - umulh x4, x11, x15 - # A[0] * B[1] - mul x20, x11, x16 - umulh x5, x11, x16 - adds x4, x4, x20 - adc x5, x5, xzr - # A[1] * B[0] - mul x20, x12, x15 - umulh x21, x12, x15 - adds x4, x4, x20 - adcs x5, x5, x21 - adc x6, xzr, xzr - # A[0] * B[2] - mul x20, x11, x17 - umulh x21, x11, x17 - adds x5, x5, x20 - adc x6, x6, x21 - # A[1] * B[1] - mul x20, x12, x16 - umulh x21, x12, x16 - adds x5, x5, x20 - adcs x6, x6, x21 - adc x7, xzr, xzr - # A[2] * B[0] - mul x20, x13, x15 - umulh x21, x13, x15 - adds x5, x5, x20 - adcs x6, x6, x21 - adc x7, x7, xzr - # A[0] * B[3] - mul x20, x11, x19 - umulh x21, x11, x19 - adds x6, x6, x20 - adcs x7, x7, x21 - adc x8, xzr, xzr - # A[1] * B[2] - mul x20, x12, x17 - umulh x21, x12, x17 - adds x6, x6, x20 - adcs x7, x7, x21 - adc x8, x8, xzr - # A[2] * B[1] - mul x20, x13, x16 - umulh x21, x13, x16 - adds x6, x6, x20 - adcs x7, x7, x21 - adc x8, x8, xzr - # A[3] * B[0] - mul x20, x14, x15 - umulh x21, x14, x15 - adds x6, x6, x20 - adcs x7, x7, x21 - adc x8, x8, xzr - # A[1] * B[3] - mul x20, x12, x19 - umulh x21, x12, x19 - adds x7, x7, x20 - adcs x8, x8, x21 - adc x9, xzr, xzr - # A[2] * B[2] - mul x20, x13, x17 - umulh x21, x13, x17 - adds x7, x7, x20 - adcs x8, x8, x21 - adc x9, x9, xzr - # A[3] * B[1] - mul x20, x14, x16 - umulh x21, x14, x16 - adds x7, x7, x20 - adcs x8, x8, x21 - adc x9, x9, xzr - # A[2] * B[3] - mul x20, x13, x19 - umulh x21, x13, x19 - adds x8, x8, x20 - adcs x9, x9, x21 - adc x10, xzr, xzr - # A[3] * B[2] - mul x20, x14, x17 - umulh x21, x14, x17 - adds x8, x8, x20 - adcs x9, x9, x21 - adc x10, x10, xzr - # A[3] * B[3] - mul x20, x14, x19 - umulh x21, x14, x19 - adds x9, x9, x20 - adc x10, x10, x21 + # Loop: 50 times + mov x23, #50 + ldp x6, x7, [x29, #48] + ldp x8, x9, [x29, #64] +L_fe_pow22523_7: + # Square + # A[0] * A[1] + umulh x12, x6, x7 + mul x11, x6, x7 + # A[0] * A[3] + umulh x14, x6, x9 + mul x13, x6, x9 + # A[0] * A[2] + mul x3, x6, x8 + adds x12, x12, x3 + umulh x4, x6, x8 + adcs x13, x13, x4 + # A[1] * A[3] + mul x3, x7, x9 + adcs x14, x14, x3 + umulh x15, x7, x9 + adc x15, x15, xzr + # A[1] * A[2] + mul x3, x7, x8 + adds x13, x13, x3 + umulh x4, x7, x8 + adcs x14, x14, x4 + # A[2] * A[3] + mul x3, x8, x9 + adcs x15, x15, x3 + umulh x16, x8, x9 + adc x16, x16, xzr + # Double + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adc x17, xzr, xzr + # A[0] * A[0] + umulh x4, x6, x6 + mul x10, x6, x6 + # A[1] * A[1] + mul x3, x7, x7 + adds x11, x11, x4 + umulh x4, x7, x7 + adcs x12, x12, x3 + # A[2] * A[2] + mul x3, x8, x8 + adcs x13, x13, x4 + umulh x4, x8, x8 + adcs x14, x14, x3 + # A[3] * A[3] + mul x3, x9, x9 + adcs x15, x15, x4 + umulh x4, x9, x9 + adcs x16, x16, x3 + adc x17, x17, x4 # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - extr x7, x7, x6, #63 - and x6, x6, #0x7fffffffffffffff - # Multiply top half by 19 - mov x20, #19 - mul x21, x20, x7 - umulh x7, x20, x7 - adds x3, x3, x21 - mul x21, x20, x8 - umulh x8, x20, x8 - adcs x4, x4, x21 - mul x21, x20, x9 - umulh x9, x20, x9 - adcs x5, x5, x21 - mul x21, x20, x10 - umulh x22, x20, x10 - adcs x6, x6, x21 - adc x22, x22, xzr - # Add remaining product results in 
- adds x4, x4, x7 - adcs x5, x5, x8 - adcs x6, x6, x9 - adc x22, x22, xzr - # Overflow - extr x22, x22, x6, #63 - mul x22, x22, x20 - and x6, x6, #0x7fffffffffffffff - adds x3, x3, x22 - adcs x4, x4, xzr - adcs x5, x5, xzr - adc x6, x6, xzr - # Reduce if top bit set - and x22, x20, x6, asr 63 - and x6, x6, #0x7fffffffffffffff - adds x3, x3, x22 - adcs x4, x4, xzr - adcs x5, x5, xzr - adc x6, x6, xzr - # Store - stp x3, x4, [x0] - stp x5, x6, [x0, #16] - ldr x0, [x29, #24] - ldr x2, [x29, #56] - # Multiply - ldp x11, x12, [x2] - ldp x13, x14, [x2, #16] - # A[0] * B[0] - mul x3, x15, x11 - umulh x4, x15, x11 - # A[0] * B[1] - mul x20, x15, x12 - umulh x5, x15, x12 - adds x4, x4, x20 + mov x3, #38 + mul x4, x3, x17 + adds x13, x13, x4 + umulh x5, x3, x17 adc x5, x5, xzr - # A[1] * B[0] - mul x20, x16, x11 - umulh x21, x16, x11 - adds x4, x4, x20 - adcs x5, x5, x21 - adc x6, xzr, xzr - # A[0] * B[2] - mul x20, x15, x13 - umulh x21, x15, x13 - adds x5, x5, x20 - adc x6, x6, x21 - # A[1] * B[1] - mul x20, x16, x12 - umulh x21, x16, x12 - adds x5, x5, x20 - adcs x6, x6, x21 - adc x7, xzr, xzr - # A[2] * B[0] - mul x20, x17, x11 - umulh x21, x17, x11 - adds x5, x5, x20 - adcs x6, x6, x21 - adc x7, x7, xzr - # A[0] * B[3] - mul x20, x15, x14 - umulh x21, x15, x14 - adds x6, x6, x20 - adcs x7, x7, x21 - adc x8, xzr, xzr - # A[1] * B[2] - mul x20, x16, x13 - umulh x21, x16, x13 - adds x6, x6, x20 - adcs x7, x7, x21 - adc x8, x8, xzr - # A[2] * B[1] - mul x20, x17, x12 - umulh x21, x17, x12 - adds x6, x6, x20 - adcs x7, x7, x21 - adc x8, x8, xzr - # A[3] * B[0] - mul x20, x19, x11 - umulh x21, x19, x11 - adds x6, x6, x20 - adcs x7, x7, x21 - adc x8, x8, xzr - # A[1] * B[3] - mul x20, x16, x14 - umulh x21, x16, x14 - adds x7, x7, x20 - adcs x8, x8, x21 - adc x9, xzr, xzr - # A[2] * B[2] - mul x20, x17, x13 - umulh x21, x17, x13 - adds x7, x7, x20 - adcs x8, x8, x21 - adc x9, x9, xzr - # A[3] * B[1] - mul x20, x19, x12 - umulh x21, x19, x12 - adds x7, x7, x20 - adcs x8, x8, x21 - adc x9, x9, xzr - # A[2] * B[3] - mul x20, x17, x14 - umulh x21, x17, x14 - adds x8, x8, x20 - adcs x9, x9, x21 - adc x10, xzr, xzr - # A[3] * B[2] - mul x20, x19, x13 - umulh x21, x19, x13 - adds x8, x8, x20 - adcs x9, x9, x21 - adc x10, x10, xzr - # A[3] * B[3] - mul x20, x19, x14 - umulh x21, x19, x14 - adds x9, x9, x20 - adc x10, x10, x21 - # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - extr x7, x7, x6, #63 - and x6, x6, #0x7fffffffffffffff - # Multiply top half by 19 - mov x20, #19 - mul x21, x20, x7 - umulh x7, x20, x7 - adds x3, x3, x21 - mul x21, x20, x8 - umulh x8, x20, x8 - adcs x4, x4, x21 - mul x21, x20, x9 - umulh x9, x20, x9 - adcs x5, x5, x21 - mul x21, x20, x10 - umulh x22, x20, x10 - adcs x6, x6, x21 - adc x22, x22, xzr - # Add remaining product results in - adds x4, x4, x7 - adcs x5, x5, x8 - adcs x6, x6, x9 - adc x22, x22, xzr - # Overflow - extr x22, x22, x6, #63 - mul x22, x22, x20 - and x6, x6, #0x7fffffffffffffff - adds x3, x3, x22 - adcs x4, x4, xzr - adcs x5, x5, xzr - adc x6, x6, xzr - # Reduce if top bit set - and x22, x20, x6, asr 63 - and x6, x6, #0x7fffffffffffffff - adds x3, x3, x22 - adcs x4, x4, xzr - adcs x5, x5, xzr - adc x6, x6, xzr + mov x3, #19 + extr x5, x5, x13, #63 + mul x5, x5, x3 + and x13, x13, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x14 + adds x10, x10, x4 + umulh x14, x3, x14 + mul x4, x3, x15 + adcs x11, x11, x4 + umulh x15, x3, x15 + mul x4, x3, x16 + adcs x12, x12, x4 + umulh x16, 
x3, x16 + adc x13, x13, xzr + # Add high product results in + adds x6, x10, x5 + adcs x7, x11, x14 + adcs x8, x12, x15 + adc x9, x13, x16 + subs x23, x23, #1 + bne L_fe_pow22523_7 # Store - stp x3, x4, [x0] - stp x5, x6, [x0, #16] - ldr x17, [x29, #72] - ldr x19, [x29, #80] - ldp x20, x21, [x29, #88] - ldr x22, [x29, #104] - ldp x29, x30, [sp], #0x70 + stp x6, x7, [x29, #48] + stp x8, x9, [x29, #64] + add x0, x29, #16 + add x1, x29, #48 + add x2, x29, #16 +#ifndef __APPLE__ + bl fe_mul +#else + bl _fe_mul +#endif /* __APPLE__ */ +#ifndef NDEBUG + add x0, x29, #16 +#endif /* !NDEBUG */ + add x1, x29, #16 +#ifndef __APPLE__ + bl fe_sq +#else + bl _fe_sq +#endif /* __APPLE__ */ +#ifndef __APPLE__ + bl fe_sq +#else + bl _fe_sq +#endif /* __APPLE__ */ + ldr x0, [x29, #112] +#ifndef NDEBUG + add x1, x29, #16 +#endif /* !NDEBUG */ + ldr x2, [x29, #120] +#ifndef __APPLE__ + bl fe_mul +#else + bl _fe_mul +#endif /* __APPLE__ */ + ldr x17, [x29, #128] + ldr x23, [x29, #136] + ldp x29, x30, [sp], #0x90 ret #ifndef __APPLE__ - .size fe_ge_to_p2,.-fe_ge_to_p2 + .size fe_pow22523,.-fe_pow22523 #endif /* __APPLE__ */ #ifndef __APPLE__ .text -.globl fe_ge_to_p3 -.type fe_ge_to_p3,@function +.globl ge_p1p1_to_p2 +.type ge_p1p1_to_p2,@function .align 2 -fe_ge_to_p3: +ge_p1p1_to_p2: #else .section __TEXT,__text -.globl _fe_ge_to_p3 +.globl _ge_p1p1_to_p2 .p2align 2 -_fe_ge_to_p3: +_ge_p1p1_to_p2: #endif /* __APPLE__ */ - stp x29, x30, [sp, #-160]! + stp x29, x30, [sp, #-80]! add x29, sp, #0 - str x17, [x29, #88] - str x19, [x29, #96] - stp x20, x21, [x29, #104] - stp x22, x23, [x29, #120] - stp x24, x25, [x29, #136] - str x26, [x29, #152] - str x1, [x29, #16] - str x2, [x29, #24] - str x3, [x29, #32] - str x4, [x29, #40] - str x5, [x29, #48] - str x6, [x29, #56] - str x7, [x29, #64] - ldr x1, [x29, #40] - ldr x2, [x29, #64] + str x17, [x29, #40] + str x19, [x29, #48] + stp x20, x21, [x29, #56] + str x22, [x29, #72] + str x0, [x29, #16] + str x1, [x29, #24] + mov x2, x1 + add x1, x1, #0x60 # Multiply - ldp x11, x12, [x1] - ldp x13, x14, [x1, #16] - ldp x15, x16, [x2] - ldp x17, x19, [x2, #16] - # A[0] * B[0] - mul x3, x11, x15 - umulh x4, x11, x15 - # A[0] * B[1] - mul x24, x11, x16 - umulh x5, x11, x16 - adds x4, x4, x24 - adc x5, x5, xzr - # A[1] * B[0] - mul x24, x12, x15 - umulh x25, x12, x15 - adds x4, x4, x24 - adcs x5, x5, x25 - adc x6, xzr, xzr - # A[0] * B[2] - mul x24, x11, x17 - umulh x25, x11, x17 - adds x5, x5, x24 - adc x6, x6, x25 - # A[1] * B[1] - mul x24, x12, x16 - umulh x25, x12, x16 - adds x5, x5, x24 - adcs x6, x6, x25 - adc x7, xzr, xzr - # A[2] * B[0] - mul x24, x13, x15 - umulh x25, x13, x15 - adds x5, x5, x24 - adcs x6, x6, x25 - adc x7, x7, xzr - # A[0] * B[3] - mul x24, x11, x19 - umulh x25, x11, x19 - adds x6, x6, x24 - adcs x7, x7, x25 - adc x8, xzr, xzr - # A[1] * B[2] - mul x24, x12, x17 - umulh x25, x12, x17 - adds x6, x6, x24 - adcs x7, x7, x25 - adc x8, x8, xzr - # A[2] * B[1] - mul x24, x13, x16 - umulh x25, x13, x16 - adds x6, x6, x24 - adcs x7, x7, x25 - adc x8, x8, xzr - # A[3] * B[0] - mul x24, x14, x15 - umulh x25, x14, x15 - adds x6, x6, x24 - adcs x7, x7, x25 - adc x8, x8, xzr - # A[1] * B[3] - mul x24, x12, x19 - umulh x25, x12, x19 - adds x7, x7, x24 - adcs x8, x8, x25 - adc x9, xzr, xzr - # A[2] * B[2] - mul x24, x13, x17 - umulh x25, x13, x17 - adds x7, x7, x24 - adcs x8, x8, x25 - adc x9, x9, xzr - # A[3] * B[1] - mul x24, x14, x16 - umulh x25, x14, x16 - adds x7, x7, x24 - adcs x8, x8, x25 - adc x9, x9, xzr - # A[2] * B[3] - mul x24, x13, x19 - umulh x25, x13, 
x19 - adds x8, x8, x24 - adcs x9, x9, x25 - adc x10, xzr, xzr - # A[3] * B[2] - mul x24, x14, x17 - umulh x25, x14, x17 - adds x8, x8, x24 - adcs x9, x9, x25 - adc x10, x10, xzr - # A[3] * B[3] - mul x24, x14, x19 - umulh x25, x14, x19 - adds x9, x9, x24 - adc x10, x10, x25 - # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - extr x7, x7, x6, #63 - and x6, x6, #0x7fffffffffffffff - # Multiply top half by 19 - mov x24, #19 - mul x25, x24, x7 - umulh x7, x24, x7 - adds x3, x3, x25 - mul x25, x24, x8 - umulh x8, x24, x8 - adcs x4, x4, x25 - mul x25, x24, x9 - umulh x9, x24, x9 - adcs x5, x5, x25 - mul x25, x24, x10 - umulh x26, x24, x10 - adcs x6, x6, x25 - adc x26, x26, xzr - # Add remaining product results in - adds x4, x4, x7 - adcs x5, x5, x8 - adcs x6, x6, x9 - adc x26, x26, xzr - # Overflow - extr x26, x26, x6, #63 - mul x26, x26, x24 - and x6, x6, #0x7fffffffffffffff - adds x3, x3, x26 - adcs x4, x4, xzr - adcs x5, x5, xzr - adc x6, x6, xzr - # Reduce if top bit set - and x26, x24, x6, asr 63 - and x6, x6, #0x7fffffffffffffff - adds x3, x3, x26 - adcs x4, x4, xzr - adcs x5, x5, xzr - adc x6, x6, xzr - # Store - stp x3, x4, [x0] - stp x5, x6, [x0, #16] - ldr x0, [x29, #32] - ldr x2, [x29, #48] - # Multiply - ldp x20, x21, [x2] - ldp x22, x23, [x2, #16] - # A[0] * B[0] - mul x3, x11, x20 - umulh x4, x11, x20 - # A[0] * B[1] - mul x24, x11, x21 - umulh x5, x11, x21 - adds x4, x4, x24 - adc x5, x5, xzr - # A[1] * B[0] - mul x24, x12, x20 - umulh x25, x12, x20 - adds x4, x4, x24 - adcs x5, x5, x25 - adc x6, xzr, xzr - # A[0] * B[2] - mul x24, x11, x22 - umulh x25, x11, x22 - adds x5, x5, x24 - adc x6, x6, x25 - # A[1] * B[1] - mul x24, x12, x21 - umulh x25, x12, x21 - adds x5, x5, x24 - adcs x6, x6, x25 - adc x7, xzr, xzr - # A[2] * B[0] - mul x24, x13, x20 - umulh x25, x13, x20 - adds x5, x5, x24 - adcs x6, x6, x25 - adc x7, x7, xzr - # A[0] * B[3] - mul x24, x11, x23 - umulh x25, x11, x23 - adds x6, x6, x24 - adcs x7, x7, x25 - adc x8, xzr, xzr - # A[1] * B[2] - mul x24, x12, x22 - umulh x25, x12, x22 - adds x6, x6, x24 - adcs x7, x7, x25 - adc x8, x8, xzr - # A[2] * B[1] - mul x24, x13, x21 - umulh x25, x13, x21 - adds x6, x6, x24 - adcs x7, x7, x25 - adc x8, x8, xzr - # A[3] * B[0] - mul x24, x14, x20 - umulh x25, x14, x20 - adds x6, x6, x24 - adcs x7, x7, x25 - adc x8, x8, xzr - # A[1] * B[3] - mul x24, x12, x23 - umulh x25, x12, x23 - adds x7, x7, x24 - adcs x8, x8, x25 - adc x9, xzr, xzr - # A[2] * B[2] - mul x24, x13, x22 - umulh x25, x13, x22 - adds x7, x7, x24 - adcs x8, x8, x25 - adc x9, x9, xzr - # A[3] * B[1] - mul x24, x14, x21 - umulh x25, x14, x21 - adds x7, x7, x24 - adcs x8, x8, x25 - adc x9, x9, xzr - # A[2] * B[3] - mul x24, x13, x23 - umulh x25, x13, x23 - adds x8, x8, x24 - adcs x9, x9, x25 - adc x10, xzr, xzr - # A[3] * B[2] - mul x24, x14, x22 - umulh x25, x14, x22 - adds x8, x8, x24 - adcs x9, x9, x25 - adc x10, x10, xzr - # A[3] * B[3] - mul x24, x14, x23 - umulh x25, x14, x23 - adds x9, x9, x24 - adc x10, x10, x25 + ldp x10, x11, [x1] + ldp x12, x13, [x1, #16] + ldp x6, x7, [x2] + ldp x8, x9, [x2, #16] + # A[0] * B[0] + umulh x15, x10, x6 + mul x14, x10, x6 + # A[2] * B[0] + umulh x17, x12, x6 + mul x16, x12, x6 + # A[1] * B[0] + mul x3, x11, x6 + adds x15, x15, x3 + umulh x4, x11, x6 + adcs x16, x16, x4 + adc x17, x17, xzr + # A[1] * B[3] + umulh x20, x11, x9 + mul x19, x11, x9 + # A[0] * B[1] + mul x3, x10, x7 + adds x15, x15, x3 + umulh x4, x10, x7 + adcs x16, x16, x4 + # 
A[2] * B[1] + mul x3, x12, x7 + adcs x17, x17, x3 + umulh x4, x12, x7 + adcs x19, x19, x4 + adc x20, x20, xzr + # A[1] * B[2] + mul x3, x11, x8 + adds x17, x17, x3 + umulh x4, x11, x8 + adcs x19, x19, x4 + adcs x20, x20, xzr + adc x21, xzr, xzr + # A[0] * B[2] + mul x3, x10, x8 + adds x16, x16, x3 + umulh x4, x10, x8 + adcs x17, x17, x4 + adcs x19, x19, xzr + adcs x20, x20, xzr + adc x21, x21, xzr + # A[1] * B[1] + mul x3, x11, x7 + adds x16, x16, x3 + umulh x4, x11, x7 + adcs x17, x17, x4 + # A[3] * B[1] + mul x3, x13, x7 + adcs x19, x19, x3 + umulh x4, x13, x7 + adcs x20, x20, x4 + adc x21, x21, xzr + # A[2] * B[2] + mul x3, x12, x8 + adds x19, x19, x3 + umulh x4, x12, x8 + adcs x20, x20, x4 + # A[3] * B[3] + mul x3, x13, x9 + adcs x21, x21, x3 + umulh x22, x13, x9 + adc x22, x22, xzr + # A[0] * B[3] + mul x3, x10, x9 + adds x17, x17, x3 + umulh x4, x10, x9 + adcs x19, x19, x4 + # A[2] * B[3] + mul x3, x12, x9 + adcs x20, x20, x3 + umulh x4, x12, x9 + adcs x21, x21, x4 + adc x22, x22, xzr + # A[3] * B[0] + mul x3, x13, x6 + adds x17, x17, x3 + umulh x4, x13, x6 + adcs x19, x19, x4 + # A[3] * B[2] + mul x3, x13, x8 + adcs x20, x20, x3 + umulh x4, x13, x8 + adcs x21, x21, x4 + adc x22, x22, xzr # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - extr x7, x7, x6, #63 - and x6, x6, #0x7fffffffffffffff - # Multiply top half by 19 - mov x24, #19 - mul x25, x24, x7 - umulh x7, x24, x7 - adds x3, x3, x25 - mul x25, x24, x8 - umulh x8, x24, x8 - adcs x4, x4, x25 - mul x25, x24, x9 - umulh x9, x24, x9 - adcs x5, x5, x25 - mul x25, x24, x10 - umulh x26, x24, x10 - adcs x6, x6, x25 - adc x26, x26, xzr - # Add remaining product results in - adds x4, x4, x7 - adcs x5, x5, x8 - adcs x6, x6, x9 - adc x26, x26, xzr - # Overflow - extr x26, x26, x6, #63 - mul x26, x26, x24 - and x6, x6, #0x7fffffffffffffff - adds x3, x3, x26 - adcs x4, x4, xzr - adcs x5, x5, xzr - adc x6, x6, xzr - # Reduce if top bit set - and x26, x24, x6, asr 63 - and x6, x6, #0x7fffffffffffffff - adds x3, x3, x26 - adcs x4, x4, xzr - adcs x5, x5, xzr - adc x6, x6, xzr + mov x3, #38 + mul x4, x3, x22 + adds x17, x17, x4 + umulh x5, x3, x22 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x17, #63 + mul x5, x5, x3 + and x17, x17, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x19 + adds x14, x14, x4 + umulh x19, x3, x19 + mul x4, x3, x20 + adcs x15, x15, x4 + umulh x20, x3, x20 + mul x4, x3, x21 + adcs x16, x16, x4 + umulh x21, x3, x21 + adc x17, x17, xzr + # Add high product results in + adds x14, x14, x5 + adcs x15, x15, x19 + adcs x16, x16, x20 + adc x17, x17, x21 # Store - stp x3, x4, [x0] - stp x5, x6, [x0, #16] - ldr x0, [x29, #16] - ldr x2, [x29, #56] + stp x14, x15, [x0] + stp x16, x17, [x0, #16] + sub x2, x1, #32 + add x0, x0, #0x40 # Multiply - ldp x11, x12, [x2] - ldp x13, x14, [x2, #16] - # A[0] * B[0] - mul x3, x20, x11 - umulh x4, x20, x11 - # A[0] * B[1] - mul x24, x20, x12 - umulh x5, x20, x12 - adds x4, x4, x24 - adc x5, x5, xzr - # A[1] * B[0] - mul x24, x21, x11 - umulh x25, x21, x11 - adds x4, x4, x24 - adcs x5, x5, x25 - adc x6, xzr, xzr - # A[0] * B[2] - mul x24, x20, x13 - umulh x25, x20, x13 - adds x5, x5, x24 - adc x6, x6, x25 - # A[1] * B[1] - mul x24, x21, x12 - umulh x25, x21, x12 - adds x5, x5, x24 - adcs x6, x6, x25 - adc x7, xzr, xzr - # A[2] * B[0] - mul x24, x22, x11 - umulh x25, x22, x11 - adds x5, x5, x24 - adcs x6, x6, x25 - adc x7, x7, xzr - # A[0] * B[3] - mul x24, x20, x14 - umulh x25, x20, x14 - adds x6, x6, x24 - 
adcs x7, x7, x25 - adc x8, xzr, xzr - # A[1] * B[2] - mul x24, x21, x13 - umulh x25, x21, x13 - adds x6, x6, x24 - adcs x7, x7, x25 - adc x8, x8, xzr - # A[2] * B[1] - mul x24, x22, x12 - umulh x25, x22, x12 - adds x6, x6, x24 - adcs x7, x7, x25 - adc x8, x8, xzr - # A[3] * B[0] - mul x24, x23, x11 - umulh x25, x23, x11 - adds x6, x6, x24 - adcs x7, x7, x25 - adc x8, x8, xzr - # A[1] * B[3] - mul x24, x21, x14 - umulh x25, x21, x14 - adds x7, x7, x24 - adcs x8, x8, x25 - adc x9, xzr, xzr - # A[2] * B[2] - mul x24, x22, x13 - umulh x25, x22, x13 - adds x7, x7, x24 - adcs x8, x8, x25 - adc x9, x9, xzr - # A[3] * B[1] - mul x24, x23, x12 - umulh x25, x23, x12 - adds x7, x7, x24 - adcs x8, x8, x25 - adc x9, x9, xzr - # A[2] * B[3] - mul x24, x22, x14 - umulh x25, x22, x14 - adds x8, x8, x24 - adcs x9, x9, x25 - adc x10, xzr, xzr - # A[3] * B[2] - mul x24, x23, x13 - umulh x25, x23, x13 - adds x8, x8, x24 - adcs x9, x9, x25 - adc x10, x10, xzr - # A[3] * B[3] - mul x24, x23, x14 - umulh x25, x23, x14 - adds x9, x9, x24 - adc x10, x10, x25 + ldp x6, x7, [x2] + ldp x8, x9, [x2, #16] + # A[0] * B[0] + umulh x15, x10, x6 + mul x14, x10, x6 + # A[2] * B[0] + umulh x17, x12, x6 + mul x16, x12, x6 + # A[1] * B[0] + mul x3, x11, x6 + adds x15, x15, x3 + umulh x4, x11, x6 + adcs x16, x16, x4 + adc x17, x17, xzr + # A[1] * B[3] + umulh x20, x11, x9 + mul x19, x11, x9 + # A[0] * B[1] + mul x3, x10, x7 + adds x15, x15, x3 + umulh x4, x10, x7 + adcs x16, x16, x4 + # A[2] * B[1] + mul x3, x12, x7 + adcs x17, x17, x3 + umulh x4, x12, x7 + adcs x19, x19, x4 + adc x20, x20, xzr + # A[1] * B[2] + mul x3, x11, x8 + adds x17, x17, x3 + umulh x4, x11, x8 + adcs x19, x19, x4 + adcs x20, x20, xzr + adc x21, xzr, xzr + # A[0] * B[2] + mul x3, x10, x8 + adds x16, x16, x3 + umulh x4, x10, x8 + adcs x17, x17, x4 + adcs x19, x19, xzr + adcs x20, x20, xzr + adc x21, x21, xzr + # A[1] * B[1] + mul x3, x11, x7 + adds x16, x16, x3 + umulh x4, x11, x7 + adcs x17, x17, x4 + # A[3] * B[1] + mul x3, x13, x7 + adcs x19, x19, x3 + umulh x4, x13, x7 + adcs x20, x20, x4 + adc x21, x21, xzr + # A[2] * B[2] + mul x3, x12, x8 + adds x19, x19, x3 + umulh x4, x12, x8 + adcs x20, x20, x4 + # A[3] * B[3] + mul x3, x13, x9 + adcs x21, x21, x3 + umulh x22, x13, x9 + adc x22, x22, xzr + # A[0] * B[3] + mul x3, x10, x9 + adds x17, x17, x3 + umulh x4, x10, x9 + adcs x19, x19, x4 + # A[2] * B[3] + mul x3, x12, x9 + adcs x20, x20, x3 + umulh x4, x12, x9 + adcs x21, x21, x4 + adc x22, x22, xzr + # A[3] * B[0] + mul x3, x13, x6 + adds x17, x17, x3 + umulh x4, x13, x6 + adcs x19, x19, x4 + # A[3] * B[2] + mul x3, x13, x8 + adcs x20, x20, x3 + umulh x4, x13, x8 + adcs x21, x21, x4 + adc x22, x22, xzr # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - extr x7, x7, x6, #63 - and x6, x6, #0x7fffffffffffffff - # Multiply top half by 19 - mov x24, #19 - mul x25, x24, x7 - umulh x7, x24, x7 - adds x3, x3, x25 - mul x25, x24, x8 - umulh x8, x24, x8 - adcs x4, x4, x25 - mul x25, x24, x9 - umulh x9, x24, x9 - adcs x5, x5, x25 - mul x25, x24, x10 - umulh x26, x24, x10 - adcs x6, x6, x25 - adc x26, x26, xzr - # Add remaining product results in - adds x4, x4, x7 - adcs x5, x5, x8 - adcs x6, x6, x9 - adc x26, x26, xzr - # Overflow - extr x26, x26, x6, #63 - mul x26, x26, x24 - and x6, x6, #0x7fffffffffffffff - adds x3, x3, x26 - adcs x4, x4, xzr - adcs x5, x5, xzr - adc x6, x6, xzr - # Reduce if top bit set - and x26, x24, x6, asr 63 - and x6, x6, #0x7fffffffffffffff - adds x3, 
x3, x26 - adcs x4, x4, xzr - adcs x5, x5, xzr - adc x6, x6, xzr + mov x3, #38 + mul x4, x3, x22 + adds x17, x17, x4 + umulh x5, x3, x22 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x17, #63 + mul x5, x5, x3 + and x17, x17, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x19 + adds x14, x14, x4 + umulh x19, x3, x19 + mul x4, x3, x20 + adcs x15, x15, x4 + umulh x20, x3, x20 + mul x4, x3, x21 + adcs x16, x16, x4 + umulh x21, x3, x21 + adc x17, x17, xzr + # Add high product results in + adds x14, x14, x5 + adcs x15, x15, x19 + adcs x16, x16, x20 + adc x17, x17, x21 # Store - stp x3, x4, [x0] - stp x5, x6, [x0, #16] - ldr x0, [x29, #24] + stp x14, x15, [x0] + stp x16, x17, [x0, #16] + sub x1, x1, #0x40 + sub x0, x0, #32 # Multiply - # A[0] * B[0] - mul x3, x11, x15 - umulh x4, x11, x15 - # A[0] * B[1] - mul x24, x11, x16 - umulh x5, x11, x16 - adds x4, x4, x24 + ldp x10, x11, [x1] + ldp x12, x13, [x1, #16] + # A[0] * B[0] + umulh x15, x10, x6 + mul x14, x10, x6 + # A[2] * B[0] + umulh x17, x12, x6 + mul x16, x12, x6 + # A[1] * B[0] + mul x3, x11, x6 + adds x15, x15, x3 + umulh x4, x11, x6 + adcs x16, x16, x4 + adc x17, x17, xzr + # A[1] * B[3] + umulh x20, x11, x9 + mul x19, x11, x9 + # A[0] * B[1] + mul x3, x10, x7 + adds x15, x15, x3 + umulh x4, x10, x7 + adcs x16, x16, x4 + # A[2] * B[1] + mul x3, x12, x7 + adcs x17, x17, x3 + umulh x4, x12, x7 + adcs x19, x19, x4 + adc x20, x20, xzr + # A[1] * B[2] + mul x3, x11, x8 + adds x17, x17, x3 + umulh x4, x11, x8 + adcs x19, x19, x4 + adcs x20, x20, xzr + adc x21, xzr, xzr + # A[0] * B[2] + mul x3, x10, x8 + adds x16, x16, x3 + umulh x4, x10, x8 + adcs x17, x17, x4 + adcs x19, x19, xzr + adcs x20, x20, xzr + adc x21, x21, xzr + # A[1] * B[1] + mul x3, x11, x7 + adds x16, x16, x3 + umulh x4, x11, x7 + adcs x17, x17, x4 + # A[3] * B[1] + mul x3, x13, x7 + adcs x19, x19, x3 + umulh x4, x13, x7 + adcs x20, x20, x4 + adc x21, x21, xzr + # A[2] * B[2] + mul x3, x12, x8 + adds x19, x19, x3 + umulh x4, x12, x8 + adcs x20, x20, x4 + # A[3] * B[3] + mul x3, x13, x9 + adcs x21, x21, x3 + umulh x22, x13, x9 + adc x22, x22, xzr + # A[0] * B[3] + mul x3, x10, x9 + adds x17, x17, x3 + umulh x4, x10, x9 + adcs x19, x19, x4 + # A[2] * B[3] + mul x3, x12, x9 + adcs x20, x20, x3 + umulh x4, x12, x9 + adcs x21, x21, x4 + adc x22, x22, xzr + # A[3] * B[0] + mul x3, x13, x6 + adds x17, x17, x3 + umulh x4, x13, x6 + adcs x19, x19, x4 + # A[3] * B[2] + mul x3, x13, x8 + adcs x20, x20, x3 + umulh x4, x13, x8 + adcs x21, x21, x4 + adc x22, x22, xzr + # Reduce + mov x3, #38 + mul x4, x3, x22 + adds x17, x17, x4 + umulh x5, x3, x22 adc x5, x5, xzr - # A[1] * B[0] - mul x24, x12, x15 - umulh x25, x12, x15 - adds x4, x4, x24 - adcs x5, x5, x25 - adc x6, xzr, xzr - # A[0] * B[2] - mul x24, x11, x17 - umulh x25, x11, x17 - adds x5, x5, x24 - adc x6, x6, x25 - # A[1] * B[1] - mul x24, x12, x16 - umulh x25, x12, x16 - adds x5, x5, x24 - adcs x6, x6, x25 - adc x7, xzr, xzr - # A[2] * B[0] - mul x24, x13, x15 - umulh x25, x13, x15 - adds x5, x5, x24 - adcs x6, x6, x25 - adc x7, x7, xzr - # A[0] * B[3] - mul x24, x11, x19 - umulh x25, x11, x19 - adds x6, x6, x24 - adcs x7, x7, x25 - adc x8, xzr, xzr - # A[1] * B[2] - mul x24, x12, x17 - umulh x25, x12, x17 - adds x6, x6, x24 - adcs x7, x7, x25 - adc x8, x8, xzr - # A[2] * B[1] - mul x24, x13, x16 - umulh x25, x13, x16 - adds x6, x6, x24 - adcs x7, x7, x25 - adc x8, x8, xzr - # A[3] * B[0] - mul x24, x14, x15 - umulh x25, x14, x15 - adds x6, x6, x24 - adcs x7, x7, x25 - adc x8, x8, xzr - # A[1] * B[3] - mul x24, x12, x19 - umulh x25, 
x12, x19 - adds x7, x7, x24 - adcs x8, x8, x25 - adc x9, xzr, xzr - # A[2] * B[2] - mul x24, x13, x17 - umulh x25, x13, x17 - adds x7, x7, x24 - adcs x8, x8, x25 - adc x9, x9, xzr - # A[3] * B[1] - mul x24, x14, x16 - umulh x25, x14, x16 - adds x7, x7, x24 - adcs x8, x8, x25 - adc x9, x9, xzr - # A[2] * B[3] - mul x24, x13, x19 - umulh x25, x13, x19 - adds x8, x8, x24 - adcs x9, x9, x25 - adc x10, xzr, xzr - # A[3] * B[2] - mul x24, x14, x17 - umulh x25, x14, x17 - adds x8, x8, x24 - adcs x9, x9, x25 - adc x10, x10, xzr - # A[3] * B[3] - mul x24, x14, x19 - umulh x25, x14, x19 - adds x9, x9, x24 - adc x10, x10, x25 - # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - extr x7, x7, x6, #63 - and x6, x6, #0x7fffffffffffffff - # Multiply top half by 19 - mov x24, #19 - mul x25, x24, x7 - umulh x7, x24, x7 - adds x3, x3, x25 - mul x25, x24, x8 - umulh x8, x24, x8 - adcs x4, x4, x25 - mul x25, x24, x9 - umulh x9, x24, x9 - adcs x5, x5, x25 - mul x25, x24, x10 - umulh x26, x24, x10 - adcs x6, x6, x25 - adc x26, x26, xzr - # Add remaining product results in - adds x4, x4, x7 - adcs x5, x5, x8 - adcs x6, x6, x9 - adc x26, x26, xzr - # Overflow - extr x26, x26, x6, #63 - mul x26, x26, x24 - and x6, x6, #0x7fffffffffffffff - adds x3, x3, x26 - adcs x4, x4, xzr - adcs x5, x5, xzr - adc x6, x6, xzr - # Reduce if top bit set - and x26, x24, x6, asr 63 - and x6, x6, #0x7fffffffffffffff - adds x3, x3, x26 - adcs x4, x4, xzr - adcs x5, x5, xzr - adc x6, x6, xzr + mov x3, #19 + extr x5, x5, x17, #63 + mul x5, x5, x3 + and x17, x17, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x19 + adds x14, x14, x4 + umulh x19, x3, x19 + mul x4, x3, x20 + adcs x15, x15, x4 + umulh x20, x3, x20 + mul x4, x3, x21 + adcs x16, x16, x4 + umulh x21, x3, x21 + adc x17, x17, xzr + # Add high product results in + adds x14, x14, x5 + adcs x15, x15, x19 + adcs x16, x16, x20 + adc x17, x17, x21 # Store - stp x3, x4, [x0] - stp x5, x6, [x0, #16] - ldr x17, [x29, #88] - ldr x19, [x29, #96] - ldp x20, x21, [x29, #104] - ldp x22, x23, [x29, #120] - ldp x24, x25, [x29, #136] - ldr x26, [x29, #152] - ldp x29, x30, [sp], #0xa0 + stp x14, x15, [x0] + stp x16, x17, [x0, #16] + ldr x17, [x29, #40] + ldr x19, [x29, #48] + ldp x20, x21, [x29, #56] + ldr x22, [x29, #72] + ldp x29, x30, [sp], #0x50 ret #ifndef __APPLE__ - .size fe_ge_to_p3,.-fe_ge_to_p3 + .size ge_p1p1_to_p2,.-ge_p1p1_to_p2 #endif /* __APPLE__ */ #ifndef __APPLE__ .text -.globl fe_ge_dbl -.type fe_ge_dbl,@function +.globl ge_p1p1_to_p3 +.type ge_p1p1_to_p3,@function .align 2 -fe_ge_dbl: +ge_p1p1_to_p3: #else .section __TEXT,__text -.globl _fe_ge_dbl +.globl _ge_p1p1_to_p3 .p2align 2 -_fe_ge_dbl: +_ge_p1p1_to_p3: #endif /* __APPLE__ */ - stp x29, x30, [sp, #-176]! + stp x29, x30, [sp, #-112]! 
add x29, sp, #0 - str x17, [x29, #88] - str x19, [x29, #96] - stp x20, x21, [x29, #104] - stp x22, x23, [x29, #120] - stp x24, x25, [x29, #136] - stp x26, x27, [x29, #152] - str x28, [x29, #168] + str x17, [x29, #40] + str x19, [x29, #48] + stp x20, x21, [x29, #56] + stp x22, x23, [x29, #72] + stp x24, x25, [x29, #88] + str x26, [x29, #104] str x0, [x29, #16] str x1, [x29, #24] - str x2, [x29, #32] - str x3, [x29, #40] - str x4, [x29, #48] - str x5, [x29, #56] - str x6, [x29, #64] - ldr x1, [x29, #48] - # Square - ldp x12, x13, [x1] - ldp x14, x15, [x1, #16] - # A[0] * A[1] - mul x5, x12, x13 - umulh x6, x12, x13 - # A[0] * A[2] - mul x25, x12, x14 - umulh x7, x12, x14 - adds x6, x6, x25 - adc x7, x7, xzr - # A[0] * A[3] - mul x25, x12, x15 - umulh x8, x12, x15 - adds x7, x7, x25 - adc x8, x8, xzr - # A[1] * A[2] - mul x25, x13, x14 - umulh x26, x13, x14 - adds x7, x7, x25 - adcs x8, x8, x26 - adc x9, xzr, xzr - # A[1] * A[3] - mul x25, x13, x15 - umulh x26, x13, x15 - adds x8, x8, x25 - adc x9, x9, x26 - # A[2] * A[3] - mul x25, x14, x15 - umulh x10, x14, x15 - adds x9, x9, x25 - adc x10, x10, xzr - # Double - adds x5, x5, x5 - adcs x6, x6, x6 - adcs x7, x7, x7 - adcs x8, x8, x8 - adcs x9, x9, x9 - adcs x10, x10, x10 - adc x11, xzr, xzr - # A[0] * A[0] - mul x4, x12, x12 - umulh x27, x12, x12 - # A[1] * A[1] - mul x25, x13, x13 - umulh x26, x13, x13 - adds x5, x5, x27 - adcs x6, x6, x25 - adc x27, x26, xzr - # A[2] * A[2] - mul x25, x14, x14 - umulh x26, x14, x14 - adds x7, x7, x27 - adcs x8, x8, x25 - adc x27, x26, xzr - # A[3] * A[3] - mul x25, x15, x15 - umulh x26, x15, x15 - adds x9, x9, x27 - adcs x10, x10, x25 - adc x11, x11, x26 - # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - and x7, x7, #0x7fffffffffffffff - # Multiply top half by 19 - mov x25, #19 - mul x26, x25, x8 - umulh x8, x25, x8 - adds x4, x4, x26 - mul x26, x25, x9 - umulh x9, x25, x9 - adcs x5, x5, x26 - mul x26, x25, x10 - umulh x10, x25, x10 - adcs x6, x6, x26 - mul x26, x25, x11 - umulh x27, x25, x11 - adcs x7, x7, x26 - adc x27, x27, xzr - # Add remaining product results in - adds x5, x5, x8 - adcs x6, x6, x9 - adcs x7, x7, x10 - adc x27, x27, xzr - # Overflow - extr x27, x27, x7, #63 - mul x27, x27, x25 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x27 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr - # Reduce if top bit set - and x27, x25, x7, asr 63 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x27 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr - # Store - stp x4, x5, [x0] - stp x6, x7, [x0, #16] - ldr x0, [x29, #32] - ldr x1, [x29, #56] - # Square - ldp x21, x22, [x1] - ldp x23, x24, [x1, #16] - # A[0] * A[1] - mul x9, x21, x22 - umulh x10, x21, x22 - # A[0] * A[2] - mul x25, x21, x23 - umulh x11, x21, x23 - adds x10, x10, x25 - adc x11, x11, xzr - # A[0] * A[3] - mul x25, x21, x24 - umulh x16, x21, x24 - adds x11, x11, x25 - adc x16, x16, xzr - # A[1] * A[2] - mul x25, x22, x23 - umulh x26, x22, x23 - adds x11, x11, x25 - adcs x16, x16, x26 - adc x17, xzr, xzr - # A[1] * A[3] - mul x25, x22, x24 - umulh x26, x22, x24 - adds x16, x16, x25 - adc x17, x17, x26 - # A[2] * A[3] - mul x25, x23, x24 - umulh x19, x23, x24 - adds x17, x17, x25 - adc x19, x19, xzr - # Double - adds x9, x9, x9 - adcs x10, x10, x10 - adcs x11, x11, x11 - adcs x16, x16, x16 - adcs x17, x17, x17 - adcs x19, x19, x19 - adc x20, xzr, xzr - # A[0] * A[0] - mul x8, x21, x21 - umulh x27, x21, x21 - # A[1] * 
A[1] - mul x25, x22, x22 - umulh x26, x22, x22 - adds x9, x9, x27 - adcs x10, x10, x25 - adc x27, x26, xzr - # A[2] * A[2] - mul x25, x23, x23 - umulh x26, x23, x23 - adds x11, x11, x27 - adcs x16, x16, x25 - adc x27, x26, xzr - # A[3] * A[3] - mul x25, x24, x24 - umulh x26, x24, x24 - adds x17, x17, x27 - adcs x19, x19, x25 - adc x20, x20, x26 + mov x2, x1 + add x1, x1, #0x60 + # Multiply + ldp x10, x11, [x1] + ldp x12, x13, [x1, #16] + ldp x6, x7, [x2] + ldp x8, x9, [x2, #16] + # A[0] * B[0] + umulh x15, x10, x6 + mul x14, x10, x6 + # A[2] * B[0] + umulh x17, x12, x6 + mul x16, x12, x6 + # A[1] * B[0] + mul x3, x11, x6 + adds x15, x15, x3 + umulh x4, x11, x6 + adcs x16, x16, x4 + adc x17, x17, xzr + # A[1] * B[3] + umulh x20, x11, x9 + mul x19, x11, x9 + # A[0] * B[1] + mul x3, x10, x7 + adds x15, x15, x3 + umulh x4, x10, x7 + adcs x16, x16, x4 + # A[2] * B[1] + mul x3, x12, x7 + adcs x17, x17, x3 + umulh x4, x12, x7 + adcs x19, x19, x4 + adc x20, x20, xzr + # A[1] * B[2] + mul x3, x11, x8 + adds x17, x17, x3 + umulh x4, x11, x8 + adcs x19, x19, x4 + adcs x20, x20, xzr + adc x21, xzr, xzr + # A[0] * B[2] + mul x3, x10, x8 + adds x16, x16, x3 + umulh x4, x10, x8 + adcs x17, x17, x4 + adcs x19, x19, xzr + adcs x20, x20, xzr + adc x21, x21, xzr + # A[1] * B[1] + mul x3, x11, x7 + adds x16, x16, x3 + umulh x4, x11, x7 + adcs x17, x17, x4 + # A[3] * B[1] + mul x3, x13, x7 + adcs x19, x19, x3 + umulh x4, x13, x7 + adcs x20, x20, x4 + adc x21, x21, xzr + # A[2] * B[2] + mul x3, x12, x8 + adds x19, x19, x3 + umulh x4, x12, x8 + adcs x20, x20, x4 + # A[3] * B[3] + mul x3, x13, x9 + adcs x21, x21, x3 + umulh x22, x13, x9 + adc x22, x22, xzr + # A[0] * B[3] + mul x3, x10, x9 + adds x17, x17, x3 + umulh x4, x10, x9 + adcs x19, x19, x4 + # A[2] * B[3] + mul x3, x12, x9 + adcs x20, x20, x3 + umulh x4, x12, x9 + adcs x21, x21, x4 + adc x22, x22, xzr + # A[3] * B[0] + mul x3, x13, x6 + adds x17, x17, x3 + umulh x4, x13, x6 + adcs x19, x19, x4 + # A[3] * B[2] + mul x3, x13, x8 + adcs x20, x20, x3 + umulh x4, x13, x8 + adcs x21, x21, x4 + adc x22, x22, xzr # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x20, x20, x19, #63 - extr x19, x19, x17, #63 - extr x17, x17, x16, #63 - extr x16, x16, x11, #63 - and x11, x11, #0x7fffffffffffffff - # Multiply top half by 19 - mov x25, #19 - mul x26, x25, x16 - umulh x16, x25, x16 - adds x8, x8, x26 - mul x26, x25, x17 - umulh x17, x25, x17 - adcs x9, x9, x26 - mul x26, x25, x19 - umulh x19, x25, x19 - adcs x10, x10, x26 - mul x26, x25, x20 - umulh x27, x25, x20 - adcs x11, x11, x26 - adc x27, x27, xzr - # Add remaining product results in - adds x9, x9, x16 - adcs x10, x10, x17 - adcs x11, x11, x19 - adc x27, x27, xzr - # Overflow - extr x27, x27, x11, #63 - mul x27, x27, x25 - and x11, x11, #0x7fffffffffffffff - adds x8, x8, x27 - adcs x9, x9, xzr - adcs x10, x10, xzr - adc x11, x11, xzr - # Reduce if top bit set - and x27, x25, x11, asr 63 - and x11, x11, #0x7fffffffffffffff - adds x8, x8, x27 - adcs x9, x9, xzr - adcs x10, x10, xzr - adc x11, x11, xzr + mov x3, #38 + mul x4, x3, x22 + adds x17, x17, x4 + umulh x5, x3, x22 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x17, #63 + mul x5, x5, x3 + and x17, x17, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x19 + adds x14, x14, x4 + umulh x19, x3, x19 + mul x4, x3, x20 + adcs x15, x15, x4 + umulh x20, x3, x20 + mul x4, x3, x21 + adcs x16, x16, x4 + umulh x21, x3, x21 + adc x17, x17, xzr + # Add high product results in + adds x14, x14, x5 + adcs x15, x15, x19 + adcs x16, x16, x20 + adc x17, 
x17, x21 # Store - stp x8, x9, [x0] - stp x10, x11, [x0, #16] - ldr x0, [x29, #24] - # Add - adds x12, x12, x21 - adcs x13, x13, x22 - adcs x14, x14, x23 - adc x15, x15, x24 - mov x25, #-19 - asr x28, x15, #63 - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff - # Sub modulus (if overflow) - subs x12, x12, x25 - sbcs x13, x13, x28 - sbcs x14, x14, x28 - sbc x15, x15, x26 - ldr x0, [x29, #40] - # Square - # A[0] * A[1] - mul x17, x12, x13 - umulh x19, x12, x13 - # A[0] * A[2] - mul x25, x12, x14 - umulh x20, x12, x14 - adds x19, x19, x25 + stp x14, x15, [x0] + stp x16, x17, [x0, #16] + sub x1, x1, #0x40 + add x0, x0, #0x60 + # Multiply + ldp x23, x24, [x1] + ldp x25, x26, [x1, #16] + # A[0] * B[0] + umulh x15, x23, x6 + mul x14, x23, x6 + # A[2] * B[0] + umulh x17, x25, x6 + mul x16, x25, x6 + # A[1] * B[0] + mul x3, x24, x6 + adds x15, x15, x3 + umulh x4, x24, x6 + adcs x16, x16, x4 + adc x17, x17, xzr + # A[1] * B[3] + umulh x20, x24, x9 + mul x19, x24, x9 + # A[0] * B[1] + mul x3, x23, x7 + adds x15, x15, x3 + umulh x4, x23, x7 + adcs x16, x16, x4 + # A[2] * B[1] + mul x3, x25, x7 + adcs x17, x17, x3 + umulh x4, x25, x7 + adcs x19, x19, x4 adc x20, x20, xzr - # A[0] * A[3] - mul x25, x12, x15 - umulh x21, x12, x15 - adds x20, x20, x25 + # A[1] * B[2] + mul x3, x24, x8 + adds x17, x17, x3 + umulh x4, x24, x8 + adcs x19, x19, x4 + adcs x20, x20, xzr + adc x21, xzr, xzr + # A[0] * B[2] + mul x3, x23, x8 + adds x16, x16, x3 + umulh x4, x23, x8 + adcs x17, x17, x4 + adcs x19, x19, xzr + adcs x20, x20, xzr adc x21, x21, xzr - # A[1] * A[2] - mul x25, x13, x14 - umulh x26, x13, x14 - adds x20, x20, x25 - adcs x21, x21, x26 - adc x22, xzr, xzr - # A[1] * A[3] - mul x25, x13, x15 - umulh x26, x13, x15 - adds x21, x21, x25 - adc x22, x22, x26 - # A[2] * A[3] - mul x25, x14, x15 - umulh x23, x14, x15 - adds x22, x22, x25 - adc x23, x23, xzr - # Double - adds x17, x17, x17 - adcs x19, x19, x19 - adcs x20, x20, x20 - adcs x21, x21, x21 - adcs x22, x22, x22 - adcs x23, x23, x23 - adc x24, xzr, xzr - # A[0] * A[0] - mul x16, x12, x12 - umulh x27, x12, x12 - # A[1] * A[1] - mul x25, x13, x13 - umulh x26, x13, x13 - adds x17, x17, x27 - adcs x19, x19, x25 - adc x27, x26, xzr - # A[2] * A[2] - mul x25, x14, x14 - umulh x26, x14, x14 - adds x20, x20, x27 - adcs x21, x21, x25 - adc x27, x26, xzr - # A[3] * A[3] - mul x25, x15, x15 - umulh x26, x15, x15 - adds x22, x22, x27 - adcs x23, x23, x25 - adc x24, x24, x26 + # A[1] * B[1] + mul x3, x24, x7 + adds x16, x16, x3 + umulh x4, x24, x7 + adcs x17, x17, x4 + # A[3] * B[1] + mul x3, x26, x7 + adcs x19, x19, x3 + umulh x4, x26, x7 + adcs x20, x20, x4 + adc x21, x21, xzr + # A[2] * B[2] + mul x3, x25, x8 + adds x19, x19, x3 + umulh x4, x25, x8 + adcs x20, x20, x4 + # A[3] * B[3] + mul x3, x26, x9 + adcs x21, x21, x3 + umulh x22, x26, x9 + adc x22, x22, xzr + # A[0] * B[3] + mul x3, x23, x9 + adds x17, x17, x3 + umulh x4, x23, x9 + adcs x19, x19, x4 + # A[2] * B[3] + mul x3, x25, x9 + adcs x20, x20, x3 + umulh x4, x25, x9 + adcs x21, x21, x4 + adc x22, x22, xzr + # A[3] * B[0] + mul x3, x26, x6 + adds x17, x17, x3 + umulh x4, x26, x6 + adcs x19, x19, x4 + # A[3] * B[2] + mul x3, x26, x8 + adcs x20, x20, x3 + umulh x4, x26, x8 + adcs x21, x21, x4 + adc x22, x22, xzr # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x24, x24, x23, #63 - extr x23, x23, x22, #63 - extr x22, x22, x21, #63 - extr x21, x21, x20, #63 - and x20, x20, #0x7fffffffffffffff - # Multiply top half by 19 - mov x25, #19 - mul x26, x25, x21 - umulh 
x21, x25, x21 - adds x16, x16, x26 - mul x26, x25, x22 - umulh x22, x25, x22 - adcs x17, x17, x26 - mul x26, x25, x23 - umulh x23, x25, x23 - adcs x19, x19, x26 - mul x26, x25, x24 - umulh x27, x25, x24 - adcs x20, x20, x26 - adc x27, x27, xzr - # Add remaining product results in - adds x17, x17, x21 - adcs x19, x19, x22 - adcs x20, x20, x23 - adc x27, x27, xzr - # Overflow - extr x27, x27, x20, #63 - mul x27, x27, x25 - and x20, x20, #0x7fffffffffffffff - adds x16, x16, x27 - adcs x17, x17, xzr - adcs x19, x19, xzr + mov x3, #38 + mul x4, x3, x22 + adds x17, x17, x4 + umulh x5, x3, x22 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x17, #63 + mul x5, x5, x3 + and x17, x17, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x19 + adds x14, x14, x4 + umulh x19, x3, x19 + mul x4, x3, x20 + adcs x15, x15, x4 + umulh x20, x3, x20 + mul x4, x3, x21 + adcs x16, x16, x4 + umulh x21, x3, x21 + adc x17, x17, xzr + # Add high product results in + adds x14, x14, x5 + adcs x15, x15, x19 + adcs x16, x16, x20 + adc x17, x17, x21 + # Store + stp x14, x15, [x0] + stp x16, x17, [x0, #16] + add x2, x1, #32 + sub x0, x0, #0x40 + # Multiply + ldp x6, x7, [x2] + ldp x8, x9, [x2, #16] + # A[0] * B[0] + umulh x15, x23, x6 + mul x14, x23, x6 + # A[2] * B[0] + umulh x17, x25, x6 + mul x16, x25, x6 + # A[1] * B[0] + mul x3, x24, x6 + adds x15, x15, x3 + umulh x4, x24, x6 + adcs x16, x16, x4 + adc x17, x17, xzr + # A[1] * B[3] + umulh x20, x24, x9 + mul x19, x24, x9 + # A[0] * B[1] + mul x3, x23, x7 + adds x15, x15, x3 + umulh x4, x23, x7 + adcs x16, x16, x4 + # A[2] * B[1] + mul x3, x25, x7 + adcs x17, x17, x3 + umulh x4, x25, x7 + adcs x19, x19, x4 adc x20, x20, xzr - # Reduce if top bit set - and x27, x25, x20, asr 63 - and x20, x20, #0x7fffffffffffffff - adds x16, x16, x27 - adcs x17, x17, xzr + # A[1] * B[2] + mul x3, x24, x8 + adds x17, x17, x3 + umulh x4, x24, x8 + adcs x19, x19, x4 + adcs x20, x20, xzr + adc x21, xzr, xzr + # A[0] * B[2] + mul x3, x23, x8 + adds x16, x16, x3 + umulh x4, x23, x8 + adcs x17, x17, x4 adcs x19, x19, xzr - adc x20, x20, xzr - # Store - stp x16, x17, [x0] - stp x19, x20, [x0, #16] - ldr x0, [x29, #24] - ldr x1, [x29, #32] - # Add - adds x12, x8, x4 - adcs x13, x9, x5 - adcs x14, x10, x6 - adc x15, x11, x7 - mov x25, #-19 - asr x28, x15, #63 - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff - # Sub modulus (if overflow) - subs x12, x12, x25 - sbcs x13, x13, x28 - sbcs x14, x14, x28 - sbc x15, x15, x26 - # Sub - subs x21, x8, x4 - sbcs x22, x9, x5 - sbcs x23, x10, x6 - sbcs x24, x11, x7 - mov x25, #-19 - csetm x28, cc - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff - # Add modulus (if underflow) - adds x21, x21, x25 - adcs x22, x22, x28 - adcs x23, x23, x28 - adc x24, x24, x26 - stp x12, x13, [x0] - stp x14, x15, [x0, #16] - stp x21, x22, [x1] - stp x23, x24, [x1, #16] - ldr x0, [x29, #16] - # Sub - subs x16, x16, x12 - sbcs x17, x17, x13 - sbcs x19, x19, x14 - sbcs x20, x20, x15 - mov x25, #-19 - csetm x28, cc - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff - # Add modulus (if underflow) - adds x16, x16, x25 - adcs x17, x17, x28 - adcs x19, x19, x28 - adc x20, x20, x26 - stp x16, x17, [x0] - stp x19, x20, [x0, #16] - ldr x0, [x29, #40] - ldr x1, [x29, #64] - # Square * 2 - ldp x12, x13, [x1] - ldp x14, x15, [x1, #16] - # A[0] * A[1] - mul x5, x12, x13 - umulh x6, x12, x13 - # A[0] * A[2] - mul x25, x12, x14 - umulh x7, x12, x14 - adds x6, x6, x25 - adc x7, x7, xzr - # A[0] * A[3] - mul x25, x12, x15 - 
umulh x8, x12, x15 - adds x7, x7, x25 - adc x8, x8, xzr - # A[1] * A[2] - mul x25, x13, x14 - umulh x26, x13, x14 - adds x7, x7, x25 - adcs x8, x8, x26 - adc x9, xzr, xzr - # A[1] * A[3] - mul x25, x13, x15 - umulh x26, x13, x15 - adds x8, x8, x25 - adc x9, x9, x26 - # A[2] * A[3] - mul x25, x14, x15 - umulh x10, x14, x15 - adds x9, x9, x25 - adc x10, x10, xzr - # Double - adds x5, x5, x5 - adcs x6, x6, x6 - adcs x7, x7, x7 - adcs x8, x8, x8 - adcs x9, x9, x9 - adcs x10, x10, x10 - adc x11, xzr, xzr - # A[0] * A[0] - mul x4, x12, x12 - umulh x28, x12, x12 - # A[1] * A[1] - mul x25, x13, x13 - umulh x26, x13, x13 - adds x5, x5, x28 - adcs x6, x6, x25 - adc x28, x26, xzr - # A[2] * A[2] - mul x25, x14, x14 - umulh x26, x14, x14 - adds x7, x7, x28 - adcs x8, x8, x25 - adc x28, x26, xzr - # A[3] * A[3] - mul x25, x15, x15 - umulh x26, x15, x15 - adds x9, x9, x28 - adcs x10, x10, x25 - adc x11, x11, x26 - # Double and Reduce - mov x25, #0x169 - # Move top half into t4-t7 and remove top bit from t3 - lsr x28, x11, #61 - extr x11, x11, x10, #62 - extr x10, x10, x9, #62 - extr x9, x9, x8, #62 - extr x8, x8, x7, #62 - extr x7, x7, x6, #63 - extr x6, x6, x5, #63 - extr x5, x5, x4, #63 - lsl x4, x4, #1 - and x7, x7, #0x7fffffffffffffff - # Two left, only one right - and x11, x11, #0x7fffffffffffffff - # Multiply top bits by 19*19 - mul x28, x28, x25 - # Multiply top half by 19 - mov x25, #19 - mul x26, x25, x8 - umulh x8, x25, x8 - adds x4, x4, x26 - mul x26, x25, x9 - umulh x9, x25, x9 - adcs x5, x5, x26 - mul x26, x25, x10 - umulh x10, x25, x10 - adcs x6, x6, x26 - mul x26, x25, x11 - umulh x27, x25, x11 - adcs x7, x7, x26 - adc x27, x27, xzr - # Add remaining product results in - adds x4, x4, x28 - adcs x5, x5, x8 - adcs x6, x6, x9 - adcs x7, x7, x10 - adc x27, x27, xzr - # Overflow - extr x27, x27, x7, #63 - mul x27, x27, x25 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x27 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr - # Reduce if top bit set - and x27, x25, x7, asr 63 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x27 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr + adcs x20, x20, xzr + adc x21, x21, xzr + # A[1] * B[1] + mul x3, x24, x7 + adds x16, x16, x3 + umulh x4, x24, x7 + adcs x17, x17, x4 + # A[3] * B[1] + mul x3, x26, x7 + adcs x19, x19, x3 + umulh x4, x26, x7 + adcs x20, x20, x4 + adc x21, x21, xzr + # A[2] * B[2] + mul x3, x25, x8 + adds x19, x19, x3 + umulh x4, x25, x8 + adcs x20, x20, x4 + # A[3] * B[3] + mul x3, x26, x9 + adcs x21, x21, x3 + umulh x22, x26, x9 + adc x22, x22, xzr + # A[0] * B[3] + mul x3, x23, x9 + adds x17, x17, x3 + umulh x4, x23, x9 + adcs x19, x19, x4 + # A[2] * B[3] + mul x3, x25, x9 + adcs x20, x20, x3 + umulh x4, x25, x9 + adcs x21, x21, x4 + adc x22, x22, xzr + # A[3] * B[0] + mul x3, x26, x6 + adds x17, x17, x3 + umulh x4, x26, x6 + adcs x19, x19, x4 + # A[3] * B[2] + mul x3, x26, x8 + adcs x20, x20, x3 + umulh x4, x26, x8 + adcs x21, x21, x4 + adc x22, x22, xzr + # Reduce + mov x3, #38 + mul x4, x3, x22 + adds x17, x17, x4 + umulh x5, x3, x22 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x17, #63 + mul x5, x5, x3 + and x17, x17, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x19 + adds x14, x14, x4 + umulh x19, x3, x19 + mul x4, x3, x20 + adcs x15, x15, x4 + umulh x20, x3, x20 + mul x4, x3, x21 + adcs x16, x16, x4 + umulh x21, x3, x21 + adc x17, x17, xzr + # Add high product results in + adds x14, x14, x5 + adcs x15, x15, x19 + adcs x16, x16, x20 + adc x17, x17, x21 # Store - ldr x0, [x29, #40] - # Sub - subs x4, x4, x21 - 
sbcs x5, x5, x22 - sbcs x6, x6, x23 - sbcs x7, x7, x24 - mov x25, #-19 - csetm x28, cc - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff - # Add modulus (if underflow) - adds x4, x4, x25 - adcs x5, x5, x28 - adcs x6, x6, x28 - adc x7, x7, x26 - stp x4, x5, [x0] - stp x6, x7, [x0, #16] - ldr x17, [x29, #88] - ldr x19, [x29, #96] - ldp x20, x21, [x29, #104] - ldp x22, x23, [x29, #120] - ldp x24, x25, [x29, #136] - ldp x26, x27, [x29, #152] - ldr x28, [x29, #168] - ldp x29, x30, [sp], #0xb0 + stp x14, x15, [x0] + stp x16, x17, [x0, #16] + add x1, x1, #0x40 + add x0, x0, #32 + # Multiply + # A[0] * B[0] + umulh x15, x10, x6 + mul x14, x10, x6 + # A[2] * B[0] + umulh x17, x12, x6 + mul x16, x12, x6 + # A[1] * B[0] + mul x3, x11, x6 + adds x15, x15, x3 + umulh x4, x11, x6 + adcs x16, x16, x4 + adc x17, x17, xzr + # A[1] * B[3] + umulh x20, x11, x9 + mul x19, x11, x9 + # A[0] * B[1] + mul x3, x10, x7 + adds x15, x15, x3 + umulh x4, x10, x7 + adcs x16, x16, x4 + # A[2] * B[1] + mul x3, x12, x7 + adcs x17, x17, x3 + umulh x4, x12, x7 + adcs x19, x19, x4 + adc x20, x20, xzr + # A[1] * B[2] + mul x3, x11, x8 + adds x17, x17, x3 + umulh x4, x11, x8 + adcs x19, x19, x4 + adcs x20, x20, xzr + adc x21, xzr, xzr + # A[0] * B[2] + mul x3, x10, x8 + adds x16, x16, x3 + umulh x4, x10, x8 + adcs x17, x17, x4 + adcs x19, x19, xzr + adcs x20, x20, xzr + adc x21, x21, xzr + # A[1] * B[1] + mul x3, x11, x7 + adds x16, x16, x3 + umulh x4, x11, x7 + adcs x17, x17, x4 + # A[3] * B[1] + mul x3, x13, x7 + adcs x19, x19, x3 + umulh x4, x13, x7 + adcs x20, x20, x4 + adc x21, x21, xzr + # A[2] * B[2] + mul x3, x12, x8 + adds x19, x19, x3 + umulh x4, x12, x8 + adcs x20, x20, x4 + # A[3] * B[3] + mul x3, x13, x9 + adcs x21, x21, x3 + umulh x22, x13, x9 + adc x22, x22, xzr + # A[0] * B[3] + mul x3, x10, x9 + adds x17, x17, x3 + umulh x4, x10, x9 + adcs x19, x19, x4 + # A[2] * B[3] + mul x3, x12, x9 + adcs x20, x20, x3 + umulh x4, x12, x9 + adcs x21, x21, x4 + adc x22, x22, xzr + # A[3] * B[0] + mul x3, x13, x6 + adds x17, x17, x3 + umulh x4, x13, x6 + adcs x19, x19, x4 + # A[3] * B[2] + mul x3, x13, x8 + adcs x20, x20, x3 + umulh x4, x13, x8 + adcs x21, x21, x4 + adc x22, x22, xzr + # Reduce + mov x3, #38 + mul x4, x3, x22 + adds x17, x17, x4 + umulh x5, x3, x22 + adc x5, x5, xzr + mov x3, #19 + extr x5, x5, x17, #63 + mul x5, x5, x3 + and x17, x17, #0x7fffffffffffffff + mov x3, #38 + mul x4, x3, x19 + adds x14, x14, x4 + umulh x19, x3, x19 + mul x4, x3, x20 + adcs x15, x15, x4 + umulh x20, x3, x20 + mul x4, x3, x21 + adcs x16, x16, x4 + umulh x21, x3, x21 + adc x17, x17, xzr + # Add high product results in + adds x14, x14, x5 + adcs x15, x15, x19 + adcs x16, x16, x20 + adc x17, x17, x21 + # Store + stp x14, x15, [x0] + stp x16, x17, [x0, #16] + ldr x17, [x29, #40] + ldr x19, [x29, #48] + ldp x20, x21, [x29, #56] + ldp x22, x23, [x29, #72] + ldp x24, x25, [x29, #88] + ldr x26, [x29, #104] + ldp x29, x30, [sp], #0x70 ret #ifndef __APPLE__ - .size fe_ge_dbl,.-fe_ge_dbl + .size ge_p1p1_to_p3,.-ge_p1p1_to_p3 #endif /* __APPLE__ */ #ifndef __APPLE__ .text -.globl fe_ge_madd -.type fe_ge_madd,@function +.globl ge_p2_dbl +.type ge_p2_dbl,@function .align 2 -fe_ge_madd: +ge_p2_dbl: #else .section __TEXT,__text -.globl _fe_ge_madd +.globl _ge_p2_dbl .p2align 2 -_fe_ge_madd: +_ge_p2_dbl: #endif /* __APPLE__ */ - stp x29, x30, [sp, #-176]! + stp x29, x30, [sp, #-128]! 
add x29, sp, #0 - str x17, [x29, #88] - str x19, [x29, #96] - stp x20, x21, [x29, #104] - stp x22, x23, [x29, #120] - stp x24, x25, [x29, #136] - stp x26, x27, [x29, #152] - str x28, [x29, #168] + str x17, [x29, #40] + str x19, [x29, #48] + stp x20, x21, [x29, #56] + stp x22, x23, [x29, #72] + stp x24, x25, [x29, #88] + stp x26, x27, [x29, #104] + str x28, [x29, #120] str x0, [x29, #16] str x1, [x29, #24] - str x2, [x29, #32] - str x3, [x29, #40] - str x4, [x29, #48] - str x5, [x29, #56] - str x6, [x29, #64] - str x7, [x29, #72] - ldr x2, [x29, #56] - ldr x3, [x29, #48] - # Add - ldp x12, x13, [x2] - ldp x14, x15, [x2, #16] - ldp x16, x17, [x3] - ldp x19, x20, [x3, #16] - adds x4, x12, x16 - adcs x5, x13, x17 - adcs x6, x14, x19 - adc x7, x15, x20 - mov x25, #-19 - asr x28, x7, #63 - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff - # Sub modulus (if overflow) - subs x4, x4, x25 - sbcs x5, x5, x28 - sbcs x6, x6, x28 - sbc x7, x7, x26 - # Sub - subs x8, x12, x16 - sbcs x9, x13, x17 - sbcs x10, x14, x19 - sbcs x11, x15, x20 - mov x25, #-19 - csetm x28, cc - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff - # Add modulus (if underflow) - adds x8, x8, x25 - adcs x9, x9, x28 - adcs x10, x10, x28 - adc x11, x11, x26 - ldr x0, [x29, #32] - ldr x2, [x29, #184] - # Multiply - ldp x21, x22, [x2] - ldp x23, x24, [x2, #16] - # A[0] * B[0] - mul x12, x4, x21 - umulh x13, x4, x21 - # A[0] * B[1] - mul x25, x4, x22 - umulh x14, x4, x22 - adds x13, x13, x25 + add x0, x0, #0x40 + # Square + ldp x4, x5, [x1] + ldp x6, x7, [x1, #16] + # A[0] * A[1] + umulh x10, x4, x5 + mul x9, x4, x5 + # A[0] * A[3] + umulh x12, x4, x7 + mul x11, x4, x7 + # A[0] * A[2] + mul x25, x4, x6 + adds x10, x10, x25 + umulh x26, x4, x6 + adcs x11, x11, x26 + # A[1] * A[3] + mul x25, x5, x7 + adcs x12, x12, x25 + umulh x13, x5, x7 + adc x13, x13, xzr + # A[1] * A[2] + mul x25, x5, x6 + adds x11, x11, x25 + umulh x26, x5, x6 + adcs x12, x12, x26 + # A[2] * A[3] + mul x25, x6, x7 + adcs x13, x13, x25 + umulh x14, x6, x7 adc x14, x14, xzr - # A[1] * B[0] - mul x25, x5, x21 - umulh x26, x5, x21 - adds x13, x13, x25 - adcs x14, x14, x26 + # Double + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 adc x15, xzr, xzr - # A[0] * B[2] - mul x25, x4, x23 - umulh x26, x4, x23 - adds x14, x14, x25 + # A[0] * A[0] + umulh x26, x4, x4 + mul x8, x4, x4 + # A[1] * A[1] + mul x25, x5, x5 + adds x9, x9, x26 + umulh x26, x5, x5 + adcs x10, x10, x25 + # A[2] * A[2] + mul x25, x6, x6 + adcs x11, x11, x26 + umulh x26, x6, x6 + adcs x12, x12, x25 + # A[3] * A[3] + mul x25, x7, x7 + adcs x13, x13, x26 + umulh x26, x7, x7 + adcs x14, x14, x25 adc x15, x15, x26 - # A[1] * B[1] - mul x25, x5, x22 - umulh x26, x5, x22 - adds x14, x14, x25 - adcs x15, x15, x26 - adc x16, xzr, xzr - # A[2] * B[0] - mul x25, x6, x21 - umulh x26, x6, x21 - adds x14, x14, x25 - adcs x15, x15, x26 - adc x16, x16, xzr - # A[0] * B[3] - mul x25, x4, x24 - umulh x26, x4, x24 - adds x15, x15, x25 - adcs x16, x16, x26 - adc x17, xzr, xzr - # A[1] * B[2] - mul x25, x5, x23 - umulh x26, x5, x23 - adds x15, x15, x25 - adcs x16, x16, x26 - adc x17, x17, xzr - # A[2] * B[1] - mul x25, x6, x22 - umulh x26, x6, x22 - adds x15, x15, x25 - adcs x16, x16, x26 - adc x17, x17, xzr - # A[3] * B[0] - mul x25, x7, x21 - umulh x26, x7, x21 - adds x15, x15, x25 - adcs x16, x16, x26 - adc x17, x17, xzr - # A[1] * B[3] - mul x25, x5, x24 - umulh x26, x5, x24 - adds x16, x16, x25 - 
adcs x17, x17, x26 - adc x19, xzr, xzr - # A[2] * B[2] - mul x25, x6, x23 - umulh x26, x6, x23 - adds x16, x16, x25 - adcs x17, x17, x26 - adc x19, x19, xzr - # A[3] * B[1] - mul x25, x7, x22 - umulh x26, x7, x22 - adds x16, x16, x25 - adcs x17, x17, x26 - adc x19, x19, xzr - # A[2] * B[3] - mul x25, x6, x24 - umulh x26, x6, x24 - adds x17, x17, x25 - adcs x19, x19, x26 - adc x20, xzr, xzr - # A[3] * B[2] - mul x25, x7, x23 - umulh x26, x7, x23 - adds x17, x17, x25 - adcs x19, x19, x26 - adc x20, x20, xzr - # A[3] * B[3] - mul x25, x7, x24 - umulh x26, x7, x24 - adds x19, x19, x25 - adc x20, x20, x26 # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x20, x20, x19, #63 - extr x19, x19, x17, #63 - extr x17, x17, x16, #63 - extr x16, x16, x15, #63 - and x15, x15, #0x7fffffffffffffff - # Multiply top half by 19 - mov x25, #19 - mul x26, x25, x16 - umulh x16, x25, x16 - adds x12, x12, x26 - mul x26, x25, x17 - umulh x17, x25, x17 - adcs x13, x13, x26 - mul x26, x25, x19 - umulh x19, x25, x19 - adcs x14, x14, x26 - mul x26, x25, x20 - umulh x27, x25, x20 - adcs x15, x15, x26 - adc x27, x27, xzr - # Add remaining product results in - adds x13, x13, x16 - adcs x14, x14, x17 - adcs x15, x15, x19 + mov x25, #38 + mul x26, x25, x15 + adds x11, x11, x26 + umulh x27, x25, x15 adc x27, x27, xzr - # Overflow - extr x27, x27, x15, #63 + mov x25, #19 + extr x27, x27, x11, #63 mul x27, x27, x25 - and x15, x15, #0x7fffffffffffffff - adds x12, x12, x27 - adcs x13, x13, xzr - adcs x14, x14, xzr - adc x15, x15, xzr - # Reduce if top bit set - and x27, x25, x15, asr 63 - and x15, x15, #0x7fffffffffffffff - adds x12, x12, x27 - adcs x13, x13, xzr - adcs x14, x14, xzr - adc x15, x15, xzr + and x11, x11, #0x7fffffffffffffff + mov x25, #38 + mul x26, x25, x12 + adds x8, x8, x26 + umulh x12, x25, x12 + mul x26, x25, x13 + adcs x9, x9, x26 + umulh x13, x25, x13 + mul x26, x25, x14 + adcs x10, x10, x26 + umulh x14, x25, x14 + adc x11, x11, xzr + # Add high product results in + adds x8, x8, x27 + adcs x9, x9, x12 + adcs x10, x10, x13 + adc x11, x11, x14 # Store - ldr x0, [x29, #24] - ldr x1, [x29, #192] - # Multiply - ldp x21, x22, [x1] - ldp x23, x24, [x1, #16] - # A[0] * B[0] - mul x4, x8, x21 - umulh x5, x8, x21 - # A[0] * B[1] - mul x25, x8, x22 - umulh x6, x8, x22 - adds x5, x5, x25 + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + add x2, x1, #32 + sub x0, x0, #32 + # Square + ldp x16, x17, [x2] + ldp x19, x20, [x2, #16] + # A[0] * A[1] + umulh x23, x16, x17 + mul x22, x16, x17 + # A[0] * A[3] + umulh x4, x16, x20 + mul x24, x16, x20 + # A[0] * A[2] + mul x25, x16, x19 + adds x23, x23, x25 + umulh x26, x16, x19 + adcs x24, x24, x26 + # A[1] * A[3] + mul x25, x17, x20 + adcs x4, x4, x25 + umulh x5, x17, x20 + adc x5, x5, xzr + # A[1] * A[2] + mul x25, x17, x19 + adds x24, x24, x25 + umulh x26, x17, x19 + adcs x4, x4, x26 + # A[2] * A[3] + mul x25, x19, x20 + adcs x5, x5, x25 + umulh x6, x19, x20 adc x6, x6, xzr - # A[1] * B[0] - mul x25, x9, x21 - umulh x26, x9, x21 - adds x5, x5, x25 - adcs x6, x6, x26 + # Double + adds x22, x22, x22 + adcs x23, x23, x23 + adcs x24, x24, x24 + adcs x4, x4, x4 + adcs x5, x5, x5 + adcs x6, x6, x6 adc x7, xzr, xzr - # A[0] * B[2] - mul x25, x8, x23 - umulh x26, x8, x23 - adds x6, x6, x25 + # A[0] * A[0] + umulh x26, x16, x16 + mul x21, x16, x16 + # A[1] * A[1] + mul x25, x17, x17 + adds x22, x22, x26 + umulh x26, x17, x17 + adcs x23, x23, x25 + # A[2] * A[2] + mul x25, x19, x19 + adcs x24, x24, x26 + umulh x26, x19, x19 + adcs x4, x4, x25 + # A[3] * A[3] + mul x25, 
x20, x20 + adcs x5, x5, x26 + umulh x26, x20, x20 + adcs x6, x6, x25 adc x7, x7, x26 - # A[1] * B[1] - mul x25, x9, x22 - umulh x26, x9, x22 - adds x6, x6, x25 - adcs x7, x7, x26 - adc x16, xzr, xzr - # A[2] * B[0] - mul x25, x10, x21 - umulh x26, x10, x21 - adds x6, x6, x25 - adcs x7, x7, x26 - adc x16, x16, xzr - # A[0] * B[3] - mul x25, x8, x24 - umulh x26, x8, x24 - adds x7, x7, x25 - adcs x16, x16, x26 - adc x17, xzr, xzr - # A[1] * B[2] - mul x25, x9, x23 - umulh x26, x9, x23 - adds x7, x7, x25 - adcs x16, x16, x26 - adc x17, x17, xzr - # A[2] * B[1] - mul x25, x10, x22 - umulh x26, x10, x22 - adds x7, x7, x25 - adcs x16, x16, x26 - adc x17, x17, xzr - # A[3] * B[0] - mul x25, x11, x21 - umulh x26, x11, x21 - adds x7, x7, x25 - adcs x16, x16, x26 - adc x17, x17, xzr - # A[1] * B[3] - mul x25, x9, x24 - umulh x26, x9, x24 - adds x16, x16, x25 - adcs x17, x17, x26 - adc x19, xzr, xzr - # A[2] * B[2] - mul x25, x10, x23 - umulh x26, x10, x23 - adds x16, x16, x25 - adcs x17, x17, x26 - adc x19, x19, xzr - # A[3] * B[1] - mul x25, x11, x22 - umulh x26, x11, x22 - adds x16, x16, x25 - adcs x17, x17, x26 - adc x19, x19, xzr - # A[2] * B[3] - mul x25, x10, x24 - umulh x26, x10, x24 - adds x17, x17, x25 - adcs x19, x19, x26 - adc x20, xzr, xzr - # A[3] * B[2] - mul x25, x11, x23 - umulh x26, x11, x23 - adds x17, x17, x25 - adcs x19, x19, x26 - adc x20, x20, xzr - # A[3] * B[3] - mul x25, x11, x24 - umulh x26, x11, x24 - adds x19, x19, x25 - adc x20, x20, x26 # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x20, x20, x19, #63 - extr x19, x19, x17, #63 - extr x17, x17, x16, #63 - extr x16, x16, x7, #63 - and x7, x7, #0x7fffffffffffffff - # Multiply top half by 19 - mov x25, #19 - mul x26, x25, x16 - umulh x16, x25, x16 - adds x4, x4, x26 - mul x26, x25, x17 - umulh x17, x25, x17 - adcs x5, x5, x26 - mul x26, x25, x19 - umulh x19, x25, x19 - adcs x6, x6, x26 - mul x26, x25, x20 - umulh x27, x25, x20 - adcs x7, x7, x26 - adc x27, x27, xzr - # Add remaining product results in - adds x5, x5, x16 - adcs x6, x6, x17 - adcs x7, x7, x19 + mov x25, #38 + mul x26, x25, x7 + adds x24, x24, x26 + umulh x27, x25, x7 adc x27, x27, xzr - # Overflow - extr x27, x27, x7, #63 + mov x25, #19 + extr x27, x27, x24, #63 mul x27, x27, x25 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x27 + and x24, x24, #0x7fffffffffffffff + mov x25, #38 + mul x26, x25, x4 + adds x21, x21, x26 + umulh x4, x25, x4 + mul x26, x25, x5 + adcs x22, x22, x26 + umulh x5, x25, x5 + mul x26, x25, x6 + adcs x23, x23, x26 + umulh x6, x25, x6 + adc x24, x24, xzr + # Add high product results in + adds x21, x21, x27 + adcs x22, x22, x4 + adcs x23, x23, x5 + adc x24, x24, x6 + add x3, x0, #32 + mov x2, x0 + add x1, x0, #32 + # Add + adds x4, x21, x8 + adcs x5, x22, x9 + adcs x6, x23, x10 + adcs x7, x24, x11 + cset x28, cs + mov x25, #19 + extr x28, x28, x7, #63 + mul x25, x28, x25 + # Sub modulus (if overflow) + adds x4, x4, x25 adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr - # Reduce if top bit set - and x27, x25, x7, asr 63 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x27 - adcs x5, x5, xzr adcs x6, x6, xzr adc x7, x7, xzr - # Store - ldr x0, [x29, #24] - ldr x1, [x29, #16] - # Add - adds x8, x12, x4 - adcs x9, x13, x5 - adcs x10, x14, x6 - adc x11, x15, x7 + # Sub + subs x12, x21, x8 + sbcs x13, x22, x9 + sbcs x14, x23, x10 + sbcs x15, x24, x11 + csetm x28, cc mov x25, #-19 - asr x28, x11, #63 + extr x28, x28, x15, #63 + mul x25, x28, x25 + # Add modulus (if underflow) + subs x12, x12, x25 + sbcs x13, x13, xzr 
+ and x15, x15, #0x7fffffffffffffff + sbcs x14, x14, xzr + sbc x15, x15, xzr + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + stp x12, x13, [x1] + stp x14, x15, [x1, #16] + ldr x1, [x29, #24] + add x2, x1, #32 + sub x0, x0, #32 + # Add + ldp x8, x9, [x1] + ldp x10, x11, [x1, #16] + adds x8, x8, x16 + adcs x9, x9, x17 + adcs x10, x10, x19 + adcs x11, x11, x20 + cset x28, cs + mov x25, #19 # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff + extr x28, x28, x11, #63 + mul x25, x28, x25 # Sub modulus (if overflow) - subs x8, x8, x25 - sbcs x9, x9, x28 - sbcs x10, x10, x28 - sbc x11, x11, x26 + adds x8, x8, x25 + adcs x9, x9, xzr + and x11, x11, #0x7fffffffffffffff + adcs x10, x10, xzr + adc x11, x11, xzr + mov x1, x0 + # Square + # A[0] * A[1] + umulh x23, x8, x9 + mul x22, x8, x9 + # A[0] * A[3] + umulh x4, x8, x11 + mul x24, x8, x11 + # A[0] * A[2] + mul x25, x8, x10 + adds x23, x23, x25 + umulh x26, x8, x10 + adcs x24, x24, x26 + # A[1] * A[3] + mul x25, x9, x11 + adcs x4, x4, x25 + umulh x5, x9, x11 + adc x5, x5, xzr + # A[1] * A[2] + mul x25, x9, x10 + adds x24, x24, x25 + umulh x26, x9, x10 + adcs x4, x4, x26 + # A[2] * A[3] + mul x25, x10, x11 + adcs x5, x5, x25 + umulh x6, x10, x11 + adc x6, x6, xzr + # Double + adds x22, x22, x22 + adcs x23, x23, x23 + adcs x24, x24, x24 + adcs x4, x4, x4 + adcs x5, x5, x5 + adcs x6, x6, x6 + adc x7, xzr, xzr + # A[0] * A[0] + umulh x26, x8, x8 + mul x21, x8, x8 + # A[1] * A[1] + mul x25, x9, x9 + adds x22, x22, x26 + umulh x26, x9, x9 + adcs x23, x23, x25 + # A[2] * A[2] + mul x25, x10, x10 + adcs x24, x24, x26 + umulh x26, x10, x10 + adcs x4, x4, x25 + # A[3] * A[3] + mul x25, x11, x11 + adcs x5, x5, x26 + umulh x26, x11, x11 + adcs x6, x6, x25 + adc x7, x7, x26 + # Reduce + mov x25, #38 + mul x26, x25, x7 + adds x24, x24, x26 + umulh x27, x25, x7 + adc x27, x27, xzr + mov x25, #19 + extr x27, x27, x24, #63 + mul x27, x27, x25 + and x24, x24, #0x7fffffffffffffff + mov x25, #38 + mul x26, x25, x4 + adds x21, x21, x26 + umulh x4, x25, x4 + mul x26, x25, x5 + adcs x22, x22, x26 + umulh x5, x25, x5 + mul x26, x25, x6 + adcs x23, x23, x26 + umulh x6, x25, x6 + adc x24, x24, xzr + # Add high product results in + adds x21, x21, x27 + adcs x22, x22, x4 + adcs x23, x23, x5 + adc x24, x24, x6 + add x2, x0, #32 # Sub - subs x16, x12, x4 - sbcs x17, x13, x5 - sbcs x19, x14, x6 - sbcs x20, x15, x7 - mov x25, #-19 + ldp x8, x9, [x2] + ldp x10, x11, [x2, #16] + subs x21, x21, x8 + sbcs x22, x22, x9 + sbcs x23, x23, x10 + sbcs x24, x24, x11 csetm x28, cc + mov x25, #-19 # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff + extr x28, x28, x24, #63 + mul x25, x28, x25 # Add modulus (if underflow) - adds x16, x16, x25 - adcs x17, x17, x28 - adcs x19, x19, x28 - adc x20, x20, x26 - stp x8, x9, [x0] - stp x10, x11, [x0, #16] - stp x16, x17, [x1] - stp x19, x20, [x1, #16] - ldr x0, [x29, #40] - ldr x1, [x29, #176] - ldr x3, [x29, #72] - # Multiply - ldp x16, x17, [x1] - ldp x19, x20, [x1, #16] - ldp x21, x22, [x3] - ldp x23, x24, [x3, #16] - # A[0] * B[0] - mul x4, x16, x21 - umulh x5, x16, x21 - # A[0] * B[1] - mul x25, x16, x22 - umulh x6, x16, x22 - adds x5, x5, x25 - adc x6, x6, xzr - # A[1] * B[0] - mul x25, x17, x21 - umulh x26, x17, x21 - adds x5, x5, x25 - adcs x6, x6, x26 - adc x7, xzr, xzr - # A[0] * B[2] - mul x25, x16, x23 - umulh x26, x16, x23 - adds x6, x6, x25 - adc x7, x7, x26 - # A[1] * B[1] - mul x25, x17, x22 - umulh x26, x17, x22 - adds x6, x6, x25 - adcs x7, x7, x26 - adc x8, xzr, xzr - # A[2] * B[0] - mul 
x25, x19, x21 - umulh x26, x19, x21 + subs x21, x21, x25 + sbcs x22, x22, xzr + and x24, x24, #0x7fffffffffffffff + sbcs x23, x23, xzr + sbc x24, x24, xzr + stp x21, x22, [x0] + stp x23, x24, [x0, #16] + ldr x2, [x29, #24] + add x2, x2, #0x40 + add x0, x0, #0x60 + # Square * 2 + ldp x16, x17, [x2] + ldp x19, x20, [x2, #16] + # A[0] * A[1] + umulh x6, x16, x17 + mul x5, x16, x17 + # A[0] * A[3] + umulh x8, x16, x20 + mul x7, x16, x20 + # A[0] * A[2] + mul x25, x16, x19 adds x6, x6, x25 + umulh x26, x16, x19 adcs x7, x7, x26 - adc x8, x8, xzr - # A[0] * B[3] - mul x25, x16, x24 - umulh x26, x16, x24 - adds x7, x7, x25 - adcs x8, x8, x26 - adc x9, xzr, xzr - # A[1] * B[2] - mul x25, x17, x23 - umulh x26, x17, x23 - adds x7, x7, x25 - adcs x8, x8, x26 - adc x9, x9, xzr - # A[2] * B[1] - mul x25, x19, x22 - umulh x26, x19, x22 - adds x7, x7, x25 - adcs x8, x8, x26 + # A[1] * A[3] + mul x25, x17, x20 + adcs x8, x8, x25 + umulh x9, x17, x20 adc x9, x9, xzr - # A[3] * B[0] - mul x25, x20, x21 - umulh x26, x20, x21 + # A[1] * A[2] + mul x25, x17, x19 adds x7, x7, x25 + umulh x26, x17, x19 adcs x8, x8, x26 - adc x9, x9, xzr - # A[1] * B[3] - mul x25, x17, x24 - umulh x26, x17, x24 - adds x8, x8, x25 - adcs x9, x9, x26 - adc x10, xzr, xzr - # A[2] * B[2] - mul x25, x19, x23 - umulh x26, x19, x23 - adds x8, x8, x25 - adcs x9, x9, x26 - adc x10, x10, xzr - # A[3] * B[1] - mul x25, x20, x22 - umulh x26, x20, x22 - adds x8, x8, x25 - adcs x9, x9, x26 + # A[2] * A[3] + mul x25, x19, x20 + adcs x9, x9, x25 + umulh x10, x19, x20 adc x10, x10, xzr - # A[2] * B[3] - mul x25, x19, x24 - umulh x26, x19, x24 - adds x9, x9, x25 - adcs x10, x10, x26 + # Double + adds x5, x5, x5 + adcs x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 adc x11, xzr, xzr - # A[3] * B[2] - mul x25, x20, x23 - umulh x26, x20, x23 - adds x9, x9, x25 - adcs x10, x10, x26 - adc x11, x11, xzr - # A[3] * B[3] - mul x25, x20, x24 - umulh x26, x20, x24 - adds x10, x10, x25 + # A[0] * A[0] + umulh x26, x16, x16 + mul x4, x16, x16 + # A[1] * A[1] + mul x25, x17, x17 + adds x5, x5, x26 + umulh x26, x17, x17 + adcs x6, x6, x25 + # A[2] * A[2] + mul x25, x19, x19 + adcs x7, x7, x26 + umulh x26, x19, x19 + adcs x8, x8, x25 + # A[3] * A[3] + mul x25, x20, x20 + adcs x9, x9, x26 + umulh x26, x20, x20 + adcs x10, x10, x25 adc x11, x11, x26 # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - and x7, x7, #0x7fffffffffffffff - # Multiply top half by 19 - mov x25, #19 - mul x26, x25, x8 - umulh x8, x25, x8 - adds x4, x4, x26 - mul x26, x25, x9 - umulh x9, x25, x9 - adcs x5, x5, x26 - mul x26, x25, x10 - umulh x10, x25, x10 - adcs x6, x6, x26 + mov x25, #38 mul x26, x25, x11 + adds x7, x7, x26 umulh x27, x25, x11 - adcs x7, x7, x26 - adc x27, x27, xzr - # Add remaining product results in - adds x5, x5, x8 - adcs x6, x6, x9 - adcs x7, x7, x10 - adc x27, x27, xzr - # Overflow - extr x27, x27, x7, #63 - mul x27, x27, x25 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x27 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr - # Reduce if top bit set - and x27, x25, x7, asr 63 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x27 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr - # Store - ldr x0, [x29, #32] - ldr x1, [x29, #64] - # Double - ldp x8, x9, [x1] - ldp x10, x11, [x1, #16] - adds x8, x8, x8 - adcs x9, x9, x9 - adcs x10, x10, x10 - adc x11, x11, x11 - mov x25, #-19 - asr x28, x11, #63 - # Mask 
the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff - # Sub modulus (if overflow) - subs x8, x8, x25 - sbcs x9, x9, x28 - sbcs x10, x10, x28 - sbc x11, x11, x26 - ldr x1, [x29, #40] - # Add - adds x12, x8, x4 - adcs x13, x9, x5 - adcs x14, x10, x6 - adc x15, x11, x7 - mov x25, #-19 - asr x28, x15, #63 - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff - # Sub modulus (if overflow) - subs x12, x12, x25 - sbcs x13, x13, x28 - sbcs x14, x14, x28 - sbc x15, x15, x26 + adc x27, x27, xzr + mov x25, #19 + extr x27, x27, x7, #63 + mul x27, x27, x25 + and x7, x7, #0x7fffffffffffffff + mov x25, #38 + mul x26, x25, x8 + adds x4, x4, x26 + umulh x8, x25, x8 + mul x26, x25, x9 + adcs x5, x5, x26 + umulh x9, x25, x9 + mul x26, x25, x10 + adcs x6, x6, x26 + umulh x10, x25, x10 + adc x7, x7, xzr + # Add high product results in + adds x4, x4, x27 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x7, x7, x10 + mov x25, #19 + lsr x26, x7, #62 + extr x7, x7, x6, #63 + extr x6, x6, x5, #63 + extr x5, x5, x4, #63 + lsl x4, x4, #1 + mul x26, x26, x25 + adds x4, x4, x26 + adcs x5, x5, xzr + and x7, x7, #0x7fffffffffffffff + adcs x6, x6, xzr + adc x7, x7, xzr + # Store + sub x1, x0, #32 # Sub - subs x16, x8, x4 - sbcs x17, x9, x5 - sbcs x19, x10, x6 - sbcs x20, x11, x7 - mov x25, #-19 + subs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 csetm x28, cc + mov x25, #-19 # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff + extr x28, x28, x7, #63 + mul x25, x28, x25 # Add modulus (if underflow) - adds x16, x16, x25 - adcs x17, x17, x28 - adcs x19, x19, x28 - adc x20, x20, x26 - stp x12, x13, [x0] - stp x14, x15, [x0, #16] - stp x16, x17, [x1] - stp x19, x20, [x1, #16] - ldr x17, [x29, #88] - ldr x19, [x29, #96] - ldp x20, x21, [x29, #104] - ldp x22, x23, [x29, #120] - ldp x24, x25, [x29, #136] - ldp x26, x27, [x29, #152] - ldr x28, [x29, #168] - ldp x29, x30, [sp], #0xb0 + subs x4, x4, x25 + sbcs x5, x5, xzr + and x7, x7, #0x7fffffffffffffff + sbcs x6, x6, xzr + sbc x7, x7, xzr + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + ldr x17, [x29, #40] + ldr x19, [x29, #48] + ldp x20, x21, [x29, #56] + ldp x22, x23, [x29, #72] + ldp x24, x25, [x29, #88] + ldp x26, x27, [x29, #104] + ldr x28, [x29, #120] + ldp x29, x30, [sp], #0x80 ret #ifndef __APPLE__ - .size fe_ge_madd,.-fe_ge_madd + .size ge_p2_dbl,.-ge_p2_dbl #endif /* __APPLE__ */ #ifndef __APPLE__ .text -.globl fe_ge_msub -.type fe_ge_msub,@function +.globl ge_madd +.type ge_madd,@function .align 2 -fe_ge_msub: +ge_madd: #else .section __TEXT,__text -.globl _fe_ge_msub +.globl _ge_madd .p2align 2 -_fe_ge_msub: +_ge_madd: #endif /* __APPLE__ */ - stp x29, x30, [sp, #-176]! + stp x29, x30, [sp, #-144]! 
add x29, sp, #0 - str x17, [x29, #88] - str x19, [x29, #96] - stp x20, x21, [x29, #104] - stp x22, x23, [x29, #120] - stp x24, x25, [x29, #136] - stp x26, x27, [x29, #152] - str x28, [x29, #168] + str x17, [x29, #56] + str x19, [x29, #64] + stp x20, x21, [x29, #72] + stp x22, x23, [x29, #88] + stp x24, x25, [x29, #104] + stp x26, x27, [x29, #120] + str x28, [x29, #136] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] - str x3, [x29, #40] - str x4, [x29, #48] - str x5, [x29, #56] - str x6, [x29, #64] - str x7, [x29, #72] - ldr x2, [x29, #56] - ldr x3, [x29, #48] + mov x3, x1 + add x2, x1, #32 + add x1, x0, #32 # Add - ldp x12, x13, [x2] - ldp x14, x15, [x2, #16] - ldp x16, x17, [x3] - ldp x19, x20, [x3, #16] - adds x4, x12, x16 - adcs x5, x13, x17 - adcs x6, x14, x19 - adc x7, x15, x20 - mov x25, #-19 - asr x28, x7, #63 - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff + ldp x8, x9, [x2] + ldp x10, x11, [x2, #16] + ldp x4, x5, [x3] + ldp x6, x7, [x3, #16] + adds x16, x8, x4 + adcs x17, x9, x5 + adcs x19, x10, x6 + adcs x20, x11, x7 + cset x28, cs + mov x25, #19 + extr x28, x28, x20, #63 + mul x25, x28, x25 # Sub modulus (if overflow) - subs x4, x4, x25 - sbcs x5, x5, x28 - sbcs x6, x6, x28 - sbc x7, x7, x26 + adds x16, x16, x25 + adcs x17, x17, xzr + and x20, x20, #0x7fffffffffffffff + adcs x19, x19, xzr + adc x20, x20, xzr # Sub - subs x8, x12, x16 - sbcs x9, x13, x17 - sbcs x10, x14, x19 - sbcs x11, x15, x20 - mov x25, #-19 + subs x12, x8, x4 + sbcs x13, x9, x5 + sbcs x14, x10, x6 + sbcs x15, x11, x7 csetm x28, cc - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff + mov x25, #-19 + extr x28, x28, x15, #63 + mul x25, x28, x25 # Add modulus (if underflow) - adds x8, x8, x25 - adcs x9, x9, x28 - adcs x10, x10, x28 - adc x11, x11, x26 - ldr x0, [x29, #32] - ldr x2, [x29, #192] + subs x12, x12, x25 + sbcs x13, x13, xzr + and x15, x15, #0x7fffffffffffffff + sbcs x14, x14, xzr + sbc x15, x15, xzr + ldr x2, [x29, #32] + mov x1, x0 # Multiply - ldp x21, x22, [x2] - ldp x23, x24, [x2, #16] - # A[0] * B[0] - mul x12, x4, x21 - umulh x13, x4, x21 - # A[0] * B[1] - mul x25, x4, x22 - umulh x14, x4, x22 - adds x13, x13, x25 - adc x14, x14, xzr - # A[1] * B[0] - mul x25, x5, x21 - umulh x26, x5, x21 - adds x13, x13, x25 - adcs x14, x14, x26 - adc x15, xzr, xzr - # A[0] * B[2] - mul x25, x4, x23 - umulh x26, x4, x23 - adds x14, x14, x25 - adc x15, x15, x26 - # A[1] * B[1] - mul x25, x5, x22 - umulh x26, x5, x22 - adds x14, x14, x25 - adcs x15, x15, x26 - adc x16, xzr, xzr - # A[2] * B[0] - mul x25, x6, x21 - umulh x26, x6, x21 - adds x14, x14, x25 - adcs x15, x15, x26 - adc x16, x16, xzr - # A[0] * B[3] - mul x25, x4, x24 - umulh x26, x4, x24 - adds x15, x15, x25 - adcs x16, x16, x26 - adc x17, xzr, xzr - # A[1] * B[2] - mul x25, x5, x23 - umulh x26, x5, x23 - adds x15, x15, x25 - adcs x16, x16, x26 - adc x17, x17, xzr - # A[2] * B[1] - mul x25, x6, x22 - umulh x26, x6, x22 - adds x15, x15, x25 - adcs x16, x16, x26 - adc x17, x17, xzr - # A[3] * B[0] - mul x25, x7, x21 - umulh x26, x7, x21 - adds x15, x15, x25 - adcs x16, x16, x26 - adc x17, x17, xzr - # A[1] * B[3] - mul x25, x5, x24 - umulh x26, x5, x24 - adds x16, x16, x25 - adcs x17, x17, x26 - adc x19, xzr, xzr - # A[2] * B[2] - mul x25, x6, x23 - umulh x26, x6, x23 - adds x16, x16, x25 - adcs x17, x17, x26 - adc x19, x19, xzr - # A[3] * B[1] - mul x25, x7, x22 - umulh x26, x7, x22 - adds x16, x16, x25 - adcs x17, x17, x26 - adc x19, x19, xzr - # A[2] * B[3] - mul x25, x6, x24 - umulh x26, 
x6, x24 - adds x17, x17, x25 - adcs x19, x19, x26 - adc x20, xzr, xzr - # A[3] * B[2] - mul x25, x7, x23 - umulh x26, x7, x23 - adds x17, x17, x25 - adcs x19, x19, x26 - adc x20, x20, xzr - # A[3] * B[3] - mul x25, x7, x24 - umulh x26, x7, x24 - adds x19, x19, x25 - adc x20, x20, x26 + ldp x8, x9, [x2] + ldp x10, x11, [x2, #16] + # A[0] * B[0] + umulh x22, x16, x8 + mul x21, x16, x8 + # A[2] * B[0] + umulh x24, x19, x8 + mul x23, x19, x8 + # A[1] * B[0] + mul x25, x17, x8 + adds x22, x22, x25 + umulh x26, x17, x8 + adcs x23, x23, x26 + adc x24, x24, xzr + # A[1] * B[3] + umulh x5, x17, x11 + mul x4, x17, x11 + # A[0] * B[1] + mul x25, x16, x9 + adds x22, x22, x25 + umulh x26, x16, x9 + adcs x23, x23, x26 + # A[2] * B[1] + mul x25, x19, x9 + adcs x24, x24, x25 + umulh x26, x19, x9 + adcs x4, x4, x26 + adc x5, x5, xzr + # A[1] * B[2] + mul x25, x17, x10 + adds x24, x24, x25 + umulh x26, x17, x10 + adcs x4, x4, x26 + adcs x5, x5, xzr + adc x6, xzr, xzr + # A[0] * B[2] + mul x25, x16, x10 + adds x23, x23, x25 + umulh x26, x16, x10 + adcs x24, x24, x26 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # A[1] * B[1] + mul x25, x17, x9 + adds x23, x23, x25 + umulh x26, x17, x9 + adcs x24, x24, x26 + # A[3] * B[1] + mul x25, x20, x9 + adcs x4, x4, x25 + umulh x26, x20, x9 + adcs x5, x5, x26 + adc x6, x6, xzr + # A[2] * B[2] + mul x25, x19, x10 + adds x4, x4, x25 + umulh x26, x19, x10 + adcs x5, x5, x26 + # A[3] * B[3] + mul x25, x20, x11 + adcs x6, x6, x25 + umulh x7, x20, x11 + adc x7, x7, xzr + # A[0] * B[3] + mul x25, x16, x11 + adds x24, x24, x25 + umulh x26, x16, x11 + adcs x4, x4, x26 + # A[2] * B[3] + mul x25, x19, x11 + adcs x5, x5, x25 + umulh x26, x19, x11 + adcs x6, x6, x26 + adc x7, x7, xzr + # A[3] * B[0] + mul x25, x20, x8 + adds x24, x24, x25 + umulh x26, x20, x8 + adcs x4, x4, x26 + # A[3] * B[2] + mul x25, x20, x10 + adcs x5, x5, x25 + umulh x26, x20, x10 + adcs x6, x6, x26 + adc x7, x7, xzr # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x20, x20, x19, #63 - extr x19, x19, x17, #63 - extr x17, x17, x16, #63 - extr x16, x16, x15, #63 - and x15, x15, #0x7fffffffffffffff - # Multiply top half by 19 - mov x25, #19 - mul x26, x25, x16 - umulh x16, x25, x16 - adds x12, x12, x26 - mul x26, x25, x17 - umulh x17, x25, x17 - adcs x13, x13, x26 - mul x26, x25, x19 - umulh x19, x25, x19 - adcs x14, x14, x26 - mul x26, x25, x20 - umulh x27, x25, x20 - adcs x15, x15, x26 + mov x25, #38 + mul x26, x25, x7 + adds x24, x24, x26 + umulh x27, x25, x7 adc x27, x27, xzr - # Add remaining product results in - adds x13, x13, x16 - adcs x14, x14, x17 - adcs x15, x15, x19 - adc x27, x27, xzr - # Overflow - extr x27, x27, x15, #63 + mov x25, #19 + extr x27, x27, x24, #63 mul x27, x27, x25 - and x15, x15, #0x7fffffffffffffff - adds x12, x12, x27 - adcs x13, x13, xzr - adcs x14, x14, xzr - adc x15, x15, xzr - # Reduce if top bit set - and x27, x25, x15, asr 63 - and x15, x15, #0x7fffffffffffffff - adds x12, x12, x27 - adcs x13, x13, xzr - adcs x14, x14, xzr - adc x15, x15, xzr - # Store - ldr x0, [x29, #24] - ldr x1, [x29, #184] + and x24, x24, #0x7fffffffffffffff + mov x25, #38 + mul x26, x25, x4 + adds x21, x21, x26 + umulh x4, x25, x4 + mul x26, x25, x5 + adcs x22, x22, x26 + umulh x5, x25, x5 + mul x26, x25, x6 + adcs x23, x23, x26 + umulh x6, x25, x6 + adc x24, x24, xzr + # Add high product results in + adds x21, x21, x27 + adcs x22, x22, x4 + adcs x23, x23, x5 + adc x24, x24, x6 + add x2, x2, #32 + add x1, x0, #32 + add x0, x0, #32 # Multiply - ldp x21, x22, [x1] - ldp 
x23, x24, [x1, #16] - # A[0] * B[0] - mul x4, x8, x21 - umulh x5, x8, x21 - # A[0] * B[1] - mul x25, x8, x22 - umulh x6, x8, x22 + ldp x16, x17, [x2] + ldp x19, x20, [x2, #16] + # A[0] * B[0] + umulh x5, x12, x16 + mul x4, x12, x16 + # A[2] * B[0] + umulh x7, x14, x16 + mul x6, x14, x16 + # A[1] * B[0] + mul x25, x13, x16 adds x5, x5, x25 - adc x6, x6, xzr - # A[1] * B[0] - mul x25, x9, x21 - umulh x26, x9, x21 + umulh x26, x13, x16 + adcs x6, x6, x26 + adc x7, x7, xzr + # A[1] * B[3] + umulh x9, x13, x20 + mul x8, x13, x20 + # A[0] * B[1] + mul x25, x12, x17 adds x5, x5, x25 + umulh x26, x12, x17 adcs x6, x6, x26 - adc x7, xzr, xzr - # A[0] * B[2] - mul x25, x8, x23 - umulh x26, x8, x23 - adds x6, x6, x25 - adc x7, x7, x26 - # A[1] * B[1] - mul x25, x9, x22 - umulh x26, x9, x22 + # A[2] * B[1] + mul x25, x14, x17 + adcs x7, x7, x25 + umulh x26, x14, x17 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[1] * B[2] + mul x25, x13, x19 + adds x7, x7, x25 + umulh x26, x13, x19 + adcs x8, x8, x26 + adcs x9, x9, xzr + adc x10, xzr, xzr + # A[0] * B[2] + mul x25, x12, x19 adds x6, x6, x25 + umulh x26, x12, x19 adcs x7, x7, x26 - adc x16, xzr, xzr - # A[2] * B[0] - mul x25, x10, x21 - umulh x26, x10, x21 + adcs x8, x8, xzr + adcs x9, x9, xzr + adc x10, x10, xzr + # A[1] * B[1] + mul x25, x13, x17 adds x6, x6, x25 + umulh x26, x13, x17 adcs x7, x7, x26 - adc x16, x16, xzr - # A[0] * B[3] - mul x25, x8, x24 - umulh x26, x8, x24 - adds x7, x7, x25 - adcs x16, x16, x26 - adc x17, xzr, xzr - # A[1] * B[2] - mul x25, x9, x23 - umulh x26, x9, x23 - adds x7, x7, x25 - adcs x16, x16, x26 - adc x17, x17, xzr - # A[2] * B[1] - mul x25, x10, x22 - umulh x26, x10, x22 + # A[3] * B[1] + mul x25, x15, x17 + adcs x8, x8, x25 + umulh x26, x15, x17 + adcs x9, x9, x26 + adc x10, x10, xzr + # A[2] * B[2] + mul x25, x14, x19 + adds x8, x8, x25 + umulh x26, x14, x19 + adcs x9, x9, x26 + # A[3] * B[3] + mul x25, x15, x20 + adcs x10, x10, x25 + umulh x11, x15, x20 + adc x11, x11, xzr + # A[0] * B[3] + mul x25, x12, x20 adds x7, x7, x25 - adcs x16, x16, x26 - adc x17, x17, xzr - # A[3] * B[0] - mul x25, x11, x21 - umulh x26, x11, x21 + umulh x26, x12, x20 + adcs x8, x8, x26 + # A[2] * B[3] + mul x25, x14, x20 + adcs x9, x9, x25 + umulh x26, x14, x20 + adcs x10, x10, x26 + adc x11, x11, xzr + # A[3] * B[0] + mul x25, x15, x16 adds x7, x7, x25 - adcs x16, x16, x26 - adc x17, x17, xzr - # A[1] * B[3] - mul x25, x9, x24 - umulh x26, x9, x24 - adds x16, x16, x25 - adcs x17, x17, x26 - adc x19, xzr, xzr - # A[2] * B[2] - mul x25, x10, x23 - umulh x26, x10, x23 - adds x16, x16, x25 - adcs x17, x17, x26 - adc x19, x19, xzr - # A[3] * B[1] - mul x25, x11, x22 - umulh x26, x11, x22 - adds x16, x16, x25 - adcs x17, x17, x26 - adc x19, x19, xzr - # A[2] * B[3] - mul x25, x10, x24 - umulh x26, x10, x24 - adds x17, x17, x25 - adcs x19, x19, x26 - adc x20, xzr, xzr - # A[3] * B[2] - mul x25, x11, x23 - umulh x26, x11, x23 - adds x17, x17, x25 - adcs x19, x19, x26 - adc x20, x20, xzr - # A[3] * B[3] - mul x25, x11, x24 - umulh x26, x11, x24 - adds x19, x19, x25 - adc x20, x20, x26 + umulh x26, x15, x16 + adcs x8, x8, x26 + # A[3] * B[2] + mul x25, x15, x19 + adcs x9, x9, x25 + umulh x26, x15, x19 + adcs x10, x10, x26 + adc x11, x11, xzr # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x20, x20, x19, #63 - extr x19, x19, x17, #63 - extr x17, x17, x16, #63 - extr x16, x16, x7, #63 - and x7, x7, #0x7fffffffffffffff - # Multiply top half by 19 - mov x25, #19 - mul x26, x25, x16 - umulh x16, x25, x16 - adds x4, x4, x26 - mul 
x26, x25, x17 - umulh x17, x25, x17 - adcs x5, x5, x26 - mul x26, x25, x19 - umulh x19, x25, x19 - adcs x6, x6, x26 - mul x26, x25, x20 - umulh x27, x25, x20 - adcs x7, x7, x26 - adc x27, x27, xzr - # Add remaining product results in - adds x5, x5, x16 - adcs x6, x6, x17 - adcs x7, x7, x19 + mov x25, #38 + mul x26, x25, x11 + adds x7, x7, x26 + umulh x27, x25, x11 adc x27, x27, xzr - # Overflow + mov x25, #19 extr x27, x27, x7, #63 mul x27, x27, x25 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x27 - adcs x5, x5, xzr - adcs x6, x6, xzr + mov x25, #38 + mul x26, x25, x8 + adds x4, x4, x26 + umulh x8, x25, x8 + mul x26, x25, x9 + adcs x5, x5, x26 + umulh x9, x25, x9 + mul x26, x25, x10 + adcs x6, x6, x26 + umulh x10, x25, x10 adc x7, x7, xzr - # Reduce if top bit set - and x27, x25, x7, asr 63 - and x7, x7, #0x7fffffffffffffff + # Add high product results in adds x4, x4, x27 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr - # Store - ldr x0, [x29, #24] - ldr x1, [x29, #16] + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x7, x7, x10 + mov x3, x0 + sub x2, x0, #32 + sub x1, x0, #32 # Add - adds x8, x12, x4 - adcs x9, x13, x5 - adcs x10, x14, x6 - adc x11, x15, x7 - mov x25, #-19 - asr x28, x11, #63 - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff + adds x8, x21, x4 + adcs x9, x22, x5 + adcs x10, x23, x6 + adcs x11, x24, x7 + cset x28, cs + mov x25, #19 + extr x28, x28, x11, #63 + mul x25, x28, x25 # Sub modulus (if overflow) - subs x8, x8, x25 - sbcs x9, x9, x28 - sbcs x10, x10, x28 - sbc x11, x11, x26 + adds x8, x8, x25 + adcs x9, x9, xzr + and x11, x11, #0x7fffffffffffffff + adcs x10, x10, xzr + adc x11, x11, xzr # Sub - subs x16, x12, x4 - sbcs x17, x13, x5 - sbcs x19, x14, x6 - sbcs x20, x15, x7 - mov x25, #-19 + subs x12, x21, x4 + sbcs x13, x22, x5 + sbcs x14, x23, x6 + sbcs x15, x24, x7 csetm x28, cc - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff + mov x25, #-19 + extr x28, x28, x15, #63 + mul x25, x28, x25 # Add modulus (if underflow) - adds x16, x16, x25 - adcs x17, x17, x28 - adcs x19, x19, x28 - adc x20, x20, x26 + subs x12, x12, x25 + sbcs x13, x13, xzr + and x15, x15, #0x7fffffffffffffff + sbcs x14, x14, xzr + sbc x15, x15, xzr stp x8, x9, [x0] stp x10, x11, [x0, #16] - stp x16, x17, [x1] - stp x19, x20, [x1, #16] - ldr x0, [x29, #40] - ldr x1, [x29, #176] - ldr x3, [x29, #72] + stp x12, x13, [x1] + stp x14, x15, [x1, #16] + ldr x1, [x29, #24] + ldr x2, [x29, #32] + add x2, x2, #0x40 + add x1, x1, #0x60 + add x0, x0, #0x40 # Multiply - ldp x16, x17, [x1] - ldp x19, x20, [x1, #16] - ldp x21, x22, [x3] - ldp x23, x24, [x3, #16] - # A[0] * B[0] - mul x4, x16, x21 - umulh x5, x16, x21 - # A[0] * B[1] - mul x25, x16, x22 - umulh x6, x16, x22 - adds x5, x5, x25 - adc x6, x6, xzr - # A[1] * B[0] - mul x25, x17, x21 - umulh x26, x17, x21 - adds x5, x5, x25 - adcs x6, x6, x26 - adc x7, xzr, xzr - # A[0] * B[2] - mul x25, x16, x23 - umulh x26, x16, x23 - adds x6, x6, x25 - adc x7, x7, x26 - # A[1] * B[1] - mul x25, x17, x22 - umulh x26, x17, x22 - adds x6, x6, x25 - adcs x7, x7, x26 - adc x8, xzr, xzr - # A[2] * B[0] - mul x25, x19, x21 - umulh x26, x19, x21 - adds x6, x6, x25 - adcs x7, x7, x26 - adc x8, x8, xzr - # A[0] * B[3] - mul x25, x16, x24 - umulh x26, x16, x24 - adds x7, x7, x25 - adcs x8, x8, x26 - adc x9, xzr, xzr - # A[1] * B[2] - mul x25, x17, x23 - umulh x26, x17, x23 - adds x7, x7, x25 - adcs x8, x8, x26 - adc x9, x9, xzr - # A[2] * B[1] - mul x25, x19, x22 - umulh x26, x19, x22 - adds x7, x7, x25 + ldp x21, x22, 
[x1] + ldp x23, x24, [x1, #16] + ldp x4, x5, [x2] + ldp x6, x7, [x2, #16] + # A[0] * B[0] + umulh x17, x21, x4 + mul x16, x21, x4 + # A[2] * B[0] + umulh x20, x23, x4 + mul x19, x23, x4 + # A[1] * B[0] + mul x25, x22, x4 + adds x17, x17, x25 + umulh x26, x22, x4 + adcs x19, x19, x26 + adc x20, x20, xzr + # A[1] * B[3] + umulh x9, x22, x7 + mul x8, x22, x7 + # A[0] * B[1] + mul x25, x21, x5 + adds x17, x17, x25 + umulh x26, x21, x5 + adcs x19, x19, x26 + # A[2] * B[1] + mul x25, x23, x5 + adcs x20, x20, x25 + umulh x26, x23, x5 adcs x8, x8, x26 adc x9, x9, xzr - # A[3] * B[0] - mul x25, x20, x21 - umulh x26, x20, x21 - adds x7, x7, x25 + # A[1] * B[2] + mul x25, x22, x6 + adds x20, x20, x25 + umulh x26, x22, x6 adcs x8, x8, x26 - adc x9, x9, xzr - # A[1] * B[3] - mul x25, x17, x24 - umulh x26, x17, x24 - adds x8, x8, x25 - adcs x9, x9, x26 + adcs x9, x9, xzr adc x10, xzr, xzr - # A[2] * B[2] - mul x25, x19, x23 - umulh x26, x19, x23 - adds x8, x8, x25 + # A[0] * B[2] + mul x25, x21, x6 + adds x19, x19, x25 + umulh x26, x21, x6 + adcs x20, x20, x26 + adcs x8, x8, xzr + adcs x9, x9, xzr + adc x10, x10, xzr + # A[1] * B[1] + mul x25, x22, x5 + adds x19, x19, x25 + umulh x26, x22, x5 + adcs x20, x20, x26 + # A[3] * B[1] + mul x25, x24, x5 + adcs x8, x8, x25 + umulh x26, x24, x5 adcs x9, x9, x26 adc x10, x10, xzr - # A[3] * B[1] - mul x25, x20, x22 - umulh x26, x20, x22 + # A[2] * B[2] + mul x25, x23, x6 adds x8, x8, x25 + umulh x26, x23, x6 adcs x9, x9, x26 - adc x10, x10, xzr - # A[2] * B[3] - mul x25, x19, x24 - umulh x26, x19, x24 - adds x9, x9, x25 + # A[3] * B[3] + mul x25, x24, x7 + adcs x10, x10, x25 + umulh x11, x24, x7 + adc x11, x11, xzr + # A[0] * B[3] + mul x25, x21, x7 + adds x20, x20, x25 + umulh x26, x21, x7 + adcs x8, x8, x26 + # A[2] * B[3] + mul x25, x23, x7 + adcs x9, x9, x25 + umulh x26, x23, x7 adcs x10, x10, x26 - adc x11, xzr, xzr - # A[3] * B[2] - mul x25, x20, x23 - umulh x26, x20, x23 - adds x9, x9, x25 + adc x11, x11, xzr + # A[3] * B[0] + mul x25, x24, x4 + adds x20, x20, x25 + umulh x26, x24, x4 + adcs x8, x8, x26 + # A[3] * B[2] + mul x25, x24, x6 + adcs x9, x9, x25 + umulh x26, x24, x6 adcs x10, x10, x26 adc x11, x11, xzr - # A[3] * B[3] - mul x25, x20, x24 - umulh x26, x20, x24 - adds x10, x10, x25 - adc x11, x11, x26 # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - and x7, x7, #0x7fffffffffffffff - # Multiply top half by 19 + mov x25, #38 + mul x26, x25, x11 + adds x20, x20, x26 + umulh x27, x25, x11 + adc x27, x27, xzr mov x25, #19 + extr x27, x27, x20, #63 + mul x27, x27, x25 + and x20, x20, #0x7fffffffffffffff + mov x25, #38 mul x26, x25, x8 + adds x16, x16, x26 umulh x8, x25, x8 - adds x4, x4, x26 mul x26, x25, x9 + adcs x17, x17, x26 umulh x9, x25, x9 - adcs x5, x5, x26 mul x26, x25, x10 + adcs x19, x19, x26 umulh x10, x25, x10 - adcs x6, x6, x26 - mul x26, x25, x11 - umulh x27, x25, x11 - adcs x7, x7, x26 - adc x27, x27, xzr - # Add remaining product results in - adds x5, x5, x8 - adcs x6, x6, x9 - adcs x7, x7, x10 - adc x27, x27, xzr - # Overflow - extr x27, x27, x7, #63 - mul x27, x27, x25 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x27 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr - # Reduce if top bit set - and x27, x25, x7, asr 63 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x27 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr - # Store - ldr x0, [x29, #32] - ldr x1, [x29, #64] + adc x20, x20, xzr + # Add high 
product results in + adds x16, x16, x27 + adcs x17, x17, x8 + adcs x19, x19, x9 + adc x20, x20, x10 + sub x1, x1, #32 # Double - ldp x8, x9, [x1] - ldp x10, x11, [x1, #16] - adds x8, x8, x8 - adcs x9, x9, x9 - adcs x10, x10, x10 - adc x11, x11, x11 - mov x25, #-19 - asr x28, x11, #63 - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff - # Sub modulus (if overflow) - subs x8, x8, x25 - sbcs x9, x9, x28 - sbcs x10, x10, x28 - sbc x11, x11, x26 - ldr x1, [x29, #40] - # Add - adds x12, x8, x4 - adcs x13, x9, x5 - adcs x14, x10, x6 - adc x15, x11, x7 + ldp x12, x13, [x1] + ldp x14, x15, [x1, #16] + adds x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adc x15, x15, x15 mov x25, #-19 asr x28, x15, #63 # Mask the modulus @@ -6006,1108 +6465,1154 @@ _fe_ge_msub: sbcs x13, x13, x28 sbcs x14, x14, x28 sbc x15, x15, x26 + mov x3, x0 + sub x2, x0, #32 + mov x1, x0 + sub x0, x0, #32 + # Add + adds x8, x12, x16 + adcs x9, x13, x17 + adcs x10, x14, x19 + adcs x11, x15, x20 + cset x28, cs + mov x25, #19 + extr x28, x28, x11, #63 + mul x25, x28, x25 + # Sub modulus (if overflow) + adds x8, x8, x25 + adcs x9, x9, xzr + and x11, x11, #0x7fffffffffffffff + adcs x10, x10, xzr + adc x11, x11, xzr # Sub - subs x16, x8, x4 - sbcs x17, x9, x5 - sbcs x19, x10, x6 - sbcs x20, x11, x7 - mov x25, #-19 + subs x4, x12, x16 + sbcs x5, x13, x17 + sbcs x6, x14, x19 + sbcs x7, x15, x20 csetm x28, cc - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff + mov x25, #-19 + extr x28, x28, x7, #63 + mul x25, x28, x25 # Add modulus (if underflow) - adds x16, x16, x25 - adcs x17, x17, x28 - adcs x19, x19, x28 - adc x20, x20, x26 - stp x12, x13, [x1] - stp x14, x15, [x1, #16] - stp x16, x17, [x0] - stp x19, x20, [x0, #16] - ldr x17, [x29, #88] - ldr x19, [x29, #96] - ldp x20, x21, [x29, #104] - ldp x22, x23, [x29, #120] - ldp x24, x25, [x29, #136] - ldp x26, x27, [x29, #152] - ldr x28, [x29, #168] - ldp x29, x30, [sp], #0xb0 + subs x4, x4, x25 + sbcs x5, x5, xzr + and x7, x7, #0x7fffffffffffffff + sbcs x6, x6, xzr + sbc x7, x7, xzr + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + stp x4, x5, [x1] + stp x6, x7, [x1, #16] + ldr x17, [x29, #56] + ldr x19, [x29, #64] + ldp x20, x21, [x29, #72] + ldp x22, x23, [x29, #88] + ldp x24, x25, [x29, #104] + ldp x26, x27, [x29, #120] + ldr x28, [x29, #136] + ldp x29, x30, [sp], #0x90 ret #ifndef __APPLE__ - .size fe_ge_msub,.-fe_ge_msub + .size ge_madd,.-ge_madd #endif /* __APPLE__ */ #ifndef __APPLE__ .text -.globl fe_ge_add -.type fe_ge_add,@function +.globl ge_msub +.type ge_msub,@function .align 2 -fe_ge_add: +ge_msub: #else .section __TEXT,__text -.globl _fe_ge_add +.globl _ge_msub .p2align 2 -_fe_ge_add: +_ge_msub: #endif /* __APPLE__ */ - stp x29, x30, [sp, #-176]! + stp x29, x30, [sp, #-144]! 
add x29, sp, #0 - str x17, [x29, #88] - str x19, [x29, #96] - stp x20, x21, [x29, #104] - stp x22, x23, [x29, #120] - stp x24, x25, [x29, #136] - stp x26, x27, [x29, #152] - str x28, [x29, #168] + str x17, [x29, #56] + str x19, [x29, #64] + stp x20, x21, [x29, #72] + stp x22, x23, [x29, #88] + stp x24, x25, [x29, #104] + stp x26, x27, [x29, #120] + str x28, [x29, #136] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] - str x3, [x29, #40] - str x4, [x29, #48] - str x5, [x29, #56] - str x6, [x29, #64] - str x7, [x29, #72] - ldr x2, [x29, #56] - ldr x3, [x29, #48] + mov x3, x1 + add x2, x1, #32 + add x1, x0, #32 # Add - ldp x12, x13, [x2] - ldp x14, x15, [x2, #16] - ldp x16, x17, [x3] - ldp x19, x20, [x3, #16] - adds x4, x12, x16 - adcs x5, x13, x17 - adcs x6, x14, x19 - adc x7, x15, x20 - mov x25, #-19 - asr x28, x7, #63 - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff + ldp x8, x9, [x2] + ldp x10, x11, [x2, #16] + ldp x4, x5, [x3] + ldp x6, x7, [x3, #16] + adds x16, x8, x4 + adcs x17, x9, x5 + adcs x19, x10, x6 + adcs x20, x11, x7 + cset x28, cs + mov x25, #19 + extr x28, x28, x20, #63 + mul x25, x28, x25 # Sub modulus (if overflow) - subs x4, x4, x25 - sbcs x5, x5, x28 - sbcs x6, x6, x28 - sbc x7, x7, x26 + adds x16, x16, x25 + adcs x17, x17, xzr + and x20, x20, #0x7fffffffffffffff + adcs x19, x19, xzr + adc x20, x20, xzr # Sub - subs x8, x12, x16 - sbcs x9, x13, x17 - sbcs x10, x14, x19 - sbcs x11, x15, x20 - mov x25, #-19 + subs x12, x8, x4 + sbcs x13, x9, x5 + sbcs x14, x10, x6 + sbcs x15, x11, x7 csetm x28, cc - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff + mov x25, #-19 + extr x28, x28, x15, #63 + mul x25, x28, x25 # Add modulus (if underflow) - adds x8, x8, x25 - adcs x9, x9, x28 - adcs x10, x10, x28 - adc x11, x11, x26 - ldr x0, [x29, #32] - ldr x2, [x29, #192] + subs x12, x12, x25 + sbcs x13, x13, xzr + and x15, x15, #0x7fffffffffffffff + sbcs x14, x14, xzr + sbc x15, x15, xzr + ldr x2, [x29, #32] + add x2, x2, #32 + mov x1, x0 # Multiply - ldp x21, x22, [x2] - ldp x23, x24, [x2, #16] - # A[0] * B[0] - mul x12, x4, x21 - umulh x13, x4, x21 - # A[0] * B[1] - mul x25, x4, x22 - umulh x14, x4, x22 - adds x13, x13, x25 - adc x14, x14, xzr - # A[1] * B[0] - mul x25, x5, x21 - umulh x26, x5, x21 - adds x13, x13, x25 - adcs x14, x14, x26 - adc x15, xzr, xzr - # A[0] * B[2] - mul x25, x4, x23 - umulh x26, x4, x23 - adds x14, x14, x25 - adc x15, x15, x26 - # A[1] * B[1] - mul x25, x5, x22 - umulh x26, x5, x22 - adds x14, x14, x25 - adcs x15, x15, x26 - adc x16, xzr, xzr - # A[2] * B[0] - mul x25, x6, x21 - umulh x26, x6, x21 - adds x14, x14, x25 - adcs x15, x15, x26 - adc x16, x16, xzr - # A[0] * B[3] - mul x25, x4, x24 - umulh x26, x4, x24 - adds x15, x15, x25 - adcs x16, x16, x26 - adc x17, xzr, xzr - # A[1] * B[2] - mul x25, x5, x23 - umulh x26, x5, x23 - adds x15, x15, x25 - adcs x16, x16, x26 - adc x17, x17, xzr - # A[2] * B[1] - mul x25, x6, x22 - umulh x26, x6, x22 - adds x15, x15, x25 - adcs x16, x16, x26 - adc x17, x17, xzr - # A[3] * B[0] - mul x25, x7, x21 - umulh x26, x7, x21 - adds x15, x15, x25 - adcs x16, x16, x26 - adc x17, x17, xzr - # A[1] * B[3] - mul x25, x5, x24 - umulh x26, x5, x24 - adds x16, x16, x25 - adcs x17, x17, x26 - adc x19, xzr, xzr - # A[2] * B[2] - mul x25, x6, x23 - umulh x26, x6, x23 - adds x16, x16, x25 - adcs x17, x17, x26 - adc x19, x19, xzr - # A[3] * B[1] - mul x25, x7, x22 - umulh x26, x7, x22 - adds x16, x16, x25 - adcs x17, x17, x26 - adc x19, x19, xzr - # A[2] * B[3] - mul x25, 
x6, x24 - umulh x26, x6, x24 - adds x17, x17, x25 - adcs x19, x19, x26 - adc x20, xzr, xzr - # A[3] * B[2] - mul x25, x7, x23 - umulh x26, x7, x23 - adds x17, x17, x25 - adcs x19, x19, x26 - adc x20, x20, xzr - # A[3] * B[3] - mul x25, x7, x24 - umulh x26, x7, x24 - adds x19, x19, x25 - adc x20, x20, x26 + ldp x8, x9, [x2] + ldp x10, x11, [x2, #16] + # A[0] * B[0] + umulh x22, x16, x8 + mul x21, x16, x8 + # A[2] * B[0] + umulh x24, x19, x8 + mul x23, x19, x8 + # A[1] * B[0] + mul x25, x17, x8 + adds x22, x22, x25 + umulh x26, x17, x8 + adcs x23, x23, x26 + adc x24, x24, xzr + # A[1] * B[3] + umulh x5, x17, x11 + mul x4, x17, x11 + # A[0] * B[1] + mul x25, x16, x9 + adds x22, x22, x25 + umulh x26, x16, x9 + adcs x23, x23, x26 + # A[2] * B[1] + mul x25, x19, x9 + adcs x24, x24, x25 + umulh x26, x19, x9 + adcs x4, x4, x26 + adc x5, x5, xzr + # A[1] * B[2] + mul x25, x17, x10 + adds x24, x24, x25 + umulh x26, x17, x10 + adcs x4, x4, x26 + adcs x5, x5, xzr + adc x6, xzr, xzr + # A[0] * B[2] + mul x25, x16, x10 + adds x23, x23, x25 + umulh x26, x16, x10 + adcs x24, x24, x26 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # A[1] * B[1] + mul x25, x17, x9 + adds x23, x23, x25 + umulh x26, x17, x9 + adcs x24, x24, x26 + # A[3] * B[1] + mul x25, x20, x9 + adcs x4, x4, x25 + umulh x26, x20, x9 + adcs x5, x5, x26 + adc x6, x6, xzr + # A[2] * B[2] + mul x25, x19, x10 + adds x4, x4, x25 + umulh x26, x19, x10 + adcs x5, x5, x26 + # A[3] * B[3] + mul x25, x20, x11 + adcs x6, x6, x25 + umulh x7, x20, x11 + adc x7, x7, xzr + # A[0] * B[3] + mul x25, x16, x11 + adds x24, x24, x25 + umulh x26, x16, x11 + adcs x4, x4, x26 + # A[2] * B[3] + mul x25, x19, x11 + adcs x5, x5, x25 + umulh x26, x19, x11 + adcs x6, x6, x26 + adc x7, x7, xzr + # A[3] * B[0] + mul x25, x20, x8 + adds x24, x24, x25 + umulh x26, x20, x8 + adcs x4, x4, x26 + # A[3] * B[2] + mul x25, x20, x10 + adcs x5, x5, x25 + umulh x26, x20, x10 + adcs x6, x6, x26 + adc x7, x7, xzr # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x20, x20, x19, #63 - extr x19, x19, x17, #63 - extr x17, x17, x16, #63 - extr x16, x16, x15, #63 - and x15, x15, #0x7fffffffffffffff - # Multiply top half by 19 - mov x25, #19 - mul x26, x25, x16 - umulh x16, x25, x16 - adds x12, x12, x26 - mul x26, x25, x17 - umulh x17, x25, x17 - adcs x13, x13, x26 - mul x26, x25, x19 - umulh x19, x25, x19 - adcs x14, x14, x26 - mul x26, x25, x20 - umulh x27, x25, x20 - adcs x15, x15, x26 - adc x27, x27, xzr - # Add remaining product results in - adds x13, x13, x16 - adcs x14, x14, x17 - adcs x15, x15, x19 + mov x25, #38 + mul x26, x25, x7 + adds x24, x24, x26 + umulh x27, x25, x7 adc x27, x27, xzr - # Overflow - extr x27, x27, x15, #63 + mov x25, #19 + extr x27, x27, x24, #63 mul x27, x27, x25 - and x15, x15, #0x7fffffffffffffff - adds x12, x12, x27 - adcs x13, x13, xzr - adcs x14, x14, xzr - adc x15, x15, xzr - # Reduce if top bit set - and x27, x25, x15, asr 63 - and x15, x15, #0x7fffffffffffffff - adds x12, x12, x27 - adcs x13, x13, xzr - adcs x14, x14, xzr - adc x15, x15, xzr - # Store - ldr x0, [x29, #24] - ldr x1, [x29, #200] + and x24, x24, #0x7fffffffffffffff + mov x25, #38 + mul x26, x25, x4 + adds x21, x21, x26 + umulh x4, x25, x4 + mul x26, x25, x5 + adcs x22, x22, x26 + umulh x5, x25, x5 + mul x26, x25, x6 + adcs x23, x23, x26 + umulh x6, x25, x6 + adc x24, x24, xzr + # Add high product results in + adds x21, x21, x27 + adcs x22, x22, x4 + adcs x23, x23, x5 + adc x24, x24, x6 + sub x2, x2, #32 + add x1, x0, #32 + add x0, x0, #32 # Multiply - ldp 
x21, x22, [x1] - ldp x23, x24, [x1, #16] - # A[0] * B[0] - mul x4, x8, x21 - umulh x5, x8, x21 - # A[0] * B[1] - mul x25, x8, x22 - umulh x6, x8, x22 + ldp x16, x17, [x2] + ldp x19, x20, [x2, #16] + # A[0] * B[0] + umulh x5, x12, x16 + mul x4, x12, x16 + # A[2] * B[0] + umulh x7, x14, x16 + mul x6, x14, x16 + # A[1] * B[0] + mul x25, x13, x16 adds x5, x5, x25 - adc x6, x6, xzr - # A[1] * B[0] - mul x25, x9, x21 - umulh x26, x9, x21 + umulh x26, x13, x16 + adcs x6, x6, x26 + adc x7, x7, xzr + # A[1] * B[3] + umulh x9, x13, x20 + mul x8, x13, x20 + # A[0] * B[1] + mul x25, x12, x17 adds x5, x5, x25 + umulh x26, x12, x17 adcs x6, x6, x26 - adc x7, xzr, xzr - # A[0] * B[2] - mul x25, x8, x23 - umulh x26, x8, x23 - adds x6, x6, x25 - adc x7, x7, x26 - # A[1] * B[1] - mul x25, x9, x22 - umulh x26, x9, x22 + # A[2] * B[1] + mul x25, x14, x17 + adcs x7, x7, x25 + umulh x26, x14, x17 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[1] * B[2] + mul x25, x13, x19 + adds x7, x7, x25 + umulh x26, x13, x19 + adcs x8, x8, x26 + adcs x9, x9, xzr + adc x10, xzr, xzr + # A[0] * B[2] + mul x25, x12, x19 adds x6, x6, x25 + umulh x26, x12, x19 adcs x7, x7, x26 - adc x16, xzr, xzr - # A[2] * B[0] - mul x25, x10, x21 - umulh x26, x10, x21 + adcs x8, x8, xzr + adcs x9, x9, xzr + adc x10, x10, xzr + # A[1] * B[1] + mul x25, x13, x17 adds x6, x6, x25 + umulh x26, x13, x17 adcs x7, x7, x26 - adc x16, x16, xzr - # A[0] * B[3] - mul x25, x8, x24 - umulh x26, x8, x24 - adds x7, x7, x25 - adcs x16, x16, x26 - adc x17, xzr, xzr - # A[1] * B[2] - mul x25, x9, x23 - umulh x26, x9, x23 - adds x7, x7, x25 - adcs x16, x16, x26 - adc x17, x17, xzr - # A[2] * B[1] - mul x25, x10, x22 - umulh x26, x10, x22 + # A[3] * B[1] + mul x25, x15, x17 + adcs x8, x8, x25 + umulh x26, x15, x17 + adcs x9, x9, x26 + adc x10, x10, xzr + # A[2] * B[2] + mul x25, x14, x19 + adds x8, x8, x25 + umulh x26, x14, x19 + adcs x9, x9, x26 + # A[3] * B[3] + mul x25, x15, x20 + adcs x10, x10, x25 + umulh x11, x15, x20 + adc x11, x11, xzr + # A[0] * B[3] + mul x25, x12, x20 adds x7, x7, x25 - adcs x16, x16, x26 - adc x17, x17, xzr - # A[3] * B[0] - mul x25, x11, x21 - umulh x26, x11, x21 + umulh x26, x12, x20 + adcs x8, x8, x26 + # A[2] * B[3] + mul x25, x14, x20 + adcs x9, x9, x25 + umulh x26, x14, x20 + adcs x10, x10, x26 + adc x11, x11, xzr + # A[3] * B[0] + mul x25, x15, x16 adds x7, x7, x25 - adcs x16, x16, x26 - adc x17, x17, xzr - # A[1] * B[3] - mul x25, x9, x24 - umulh x26, x9, x24 - adds x16, x16, x25 - adcs x17, x17, x26 - adc x19, xzr, xzr - # A[2] * B[2] - mul x25, x10, x23 - umulh x26, x10, x23 - adds x16, x16, x25 - adcs x17, x17, x26 - adc x19, x19, xzr - # A[3] * B[1] - mul x25, x11, x22 - umulh x26, x11, x22 - adds x16, x16, x25 - adcs x17, x17, x26 - adc x19, x19, xzr - # A[2] * B[3] - mul x25, x10, x24 - umulh x26, x10, x24 - adds x17, x17, x25 - adcs x19, x19, x26 - adc x20, xzr, xzr - # A[3] * B[2] - mul x25, x11, x23 - umulh x26, x11, x23 - adds x17, x17, x25 - adcs x19, x19, x26 - adc x20, x20, xzr - # A[3] * B[3] - mul x25, x11, x24 - umulh x26, x11, x24 - adds x19, x19, x25 - adc x20, x20, x26 + umulh x26, x15, x16 + adcs x8, x8, x26 + # A[3] * B[2] + mul x25, x15, x19 + adcs x9, x9, x25 + umulh x26, x15, x19 + adcs x10, x10, x26 + adc x11, x11, xzr # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x20, x20, x19, #63 - extr x19, x19, x17, #63 - extr x17, x17, x16, #63 - extr x16, x16, x7, #63 - and x7, x7, #0x7fffffffffffffff - # Multiply top half by 19 - mov x25, #19 - mul x26, x25, x16 - umulh x16, x25, x16 - 
adds x4, x4, x26 - mul x26, x25, x17 - umulh x17, x25, x17 - adcs x5, x5, x26 - mul x26, x25, x19 - umulh x19, x25, x19 - adcs x6, x6, x26 - mul x26, x25, x20 - umulh x27, x25, x20 - adcs x7, x7, x26 - adc x27, x27, xzr - # Add remaining product results in - adds x5, x5, x16 - adcs x6, x6, x17 - adcs x7, x7, x19 + mov x25, #38 + mul x26, x25, x11 + adds x7, x7, x26 + umulh x27, x25, x11 adc x27, x27, xzr - # Overflow + mov x25, #19 extr x27, x27, x7, #63 mul x27, x27, x25 and x7, x7, #0x7fffffffffffffff - adds x4, x4, x27 - adcs x5, x5, xzr - adcs x6, x6, xzr + mov x25, #38 + mul x26, x25, x8 + adds x4, x4, x26 + umulh x8, x25, x8 + mul x26, x25, x9 + adcs x5, x5, x26 + umulh x9, x25, x9 + mul x26, x25, x10 + adcs x6, x6, x26 + umulh x10, x25, x10 adc x7, x7, xzr - # Reduce if top bit set - and x27, x25, x7, asr 63 - and x7, x7, #0x7fffffffffffffff + # Add high product results in adds x4, x4, x27 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr - # Store - ldr x0, [x29, #24] - ldr x1, [x29, #16] + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x7, x7, x10 + mov x3, x0 + sub x2, x0, #32 + sub x1, x0, #32 # Add - adds x8, x12, x4 - adcs x9, x13, x5 - adcs x10, x14, x6 - adc x11, x15, x7 + adds x8, x21, x4 + adcs x9, x22, x5 + adcs x10, x23, x6 + adcs x11, x24, x7 + cset x28, cs + mov x25, #19 + extr x28, x28, x11, #63 + mul x25, x28, x25 + # Sub modulus (if overflow) + adds x8, x8, x25 + adcs x9, x9, xzr + and x11, x11, #0x7fffffffffffffff + adcs x10, x10, xzr + adc x11, x11, xzr + # Sub + subs x12, x21, x4 + sbcs x13, x22, x5 + sbcs x14, x23, x6 + sbcs x15, x24, x7 + csetm x28, cc mov x25, #-19 - asr x28, x11, #63 + extr x28, x28, x15, #63 + mul x25, x28, x25 + # Add modulus (if underflow) + subs x12, x12, x25 + sbcs x13, x13, xzr + and x15, x15, #0x7fffffffffffffff + sbcs x14, x14, xzr + sbc x15, x15, xzr + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + stp x12, x13, [x1] + stp x14, x15, [x1, #16] + ldr x1, [x29, #24] + ldr x2, [x29, #32] + add x2, x2, #0x40 + add x1, x1, #0x60 + add x0, x0, #0x40 + # Multiply + ldp x21, x22, [x1] + ldp x23, x24, [x1, #16] + ldp x4, x5, [x2] + ldp x6, x7, [x2, #16] + # A[0] * B[0] + umulh x17, x21, x4 + mul x16, x21, x4 + # A[2] * B[0] + umulh x20, x23, x4 + mul x19, x23, x4 + # A[1] * B[0] + mul x25, x22, x4 + adds x17, x17, x25 + umulh x26, x22, x4 + adcs x19, x19, x26 + adc x20, x20, xzr + # A[1] * B[3] + umulh x9, x22, x7 + mul x8, x22, x7 + # A[0] * B[1] + mul x25, x21, x5 + adds x17, x17, x25 + umulh x26, x21, x5 + adcs x19, x19, x26 + # A[2] * B[1] + mul x25, x23, x5 + adcs x20, x20, x25 + umulh x26, x23, x5 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[1] * B[2] + mul x25, x22, x6 + adds x20, x20, x25 + umulh x26, x22, x6 + adcs x8, x8, x26 + adcs x9, x9, xzr + adc x10, xzr, xzr + # A[0] * B[2] + mul x25, x21, x6 + adds x19, x19, x25 + umulh x26, x21, x6 + adcs x20, x20, x26 + adcs x8, x8, xzr + adcs x9, x9, xzr + adc x10, x10, xzr + # A[1] * B[1] + mul x25, x22, x5 + adds x19, x19, x25 + umulh x26, x22, x5 + adcs x20, x20, x26 + # A[3] * B[1] + mul x25, x24, x5 + adcs x8, x8, x25 + umulh x26, x24, x5 + adcs x9, x9, x26 + adc x10, x10, xzr + # A[2] * B[2] + mul x25, x23, x6 + adds x8, x8, x25 + umulh x26, x23, x6 + adcs x9, x9, x26 + # A[3] * B[3] + mul x25, x24, x7 + adcs x10, x10, x25 + umulh x11, x24, x7 + adc x11, x11, xzr + # A[0] * B[3] + mul x25, x21, x7 + adds x20, x20, x25 + umulh x26, x21, x7 + adcs x8, x8, x26 + # A[2] * B[3] + mul x25, x23, x7 + adcs x9, x9, x25 + umulh x26, x23, x7 + adcs x10, x10, x26 + adc x11, x11, xzr + # A[3] * B[0] + 
mul x25, x24, x4 + adds x20, x20, x25 + umulh x26, x24, x4 + adcs x8, x8, x26 + # A[3] * B[2] + mul x25, x24, x6 + adcs x9, x9, x25 + umulh x26, x24, x6 + adcs x10, x10, x26 + adc x11, x11, xzr + # Reduce + mov x25, #38 + mul x26, x25, x11 + adds x20, x20, x26 + umulh x27, x25, x11 + adc x27, x27, xzr + mov x25, #19 + extr x27, x27, x20, #63 + mul x27, x27, x25 + and x20, x20, #0x7fffffffffffffff + mov x25, #38 + mul x26, x25, x8 + adds x16, x16, x26 + umulh x8, x25, x8 + mul x26, x25, x9 + adcs x17, x17, x26 + umulh x9, x25, x9 + mul x26, x25, x10 + adcs x19, x19, x26 + umulh x10, x25, x10 + adc x20, x20, xzr + # Add high product results in + adds x16, x16, x27 + adcs x17, x17, x8 + adcs x19, x19, x9 + adc x20, x20, x10 + sub x1, x1, #32 + # Double + ldp x12, x13, [x1] + ldp x14, x15, [x1, #16] + adds x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adc x15, x15, x15 + mov x25, #-19 + asr x28, x15, #63 # Mask the modulus and x25, x28, x25 and x26, x28, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x8, x8, x25 - sbcs x9, x9, x28 - sbcs x10, x10, x28 - sbc x11, x11, x26 + subs x12, x12, x25 + sbcs x13, x13, x28 + sbcs x14, x14, x28 + sbc x15, x15, x26 + mov x3, x0 + sub x2, x0, #32 + sub x1, x0, #32 + # Add + adds x8, x12, x16 + adcs x9, x13, x17 + adcs x10, x14, x19 + adcs x11, x15, x20 + cset x28, cs + mov x25, #19 + extr x28, x28, x11, #63 + mul x25, x28, x25 + # Sub modulus (if overflow) + adds x8, x8, x25 + adcs x9, x9, xzr + and x11, x11, #0x7fffffffffffffff + adcs x10, x10, xzr + adc x11, x11, xzr # Sub - subs x16, x12, x4 - sbcs x17, x13, x5 - sbcs x19, x14, x6 - sbcs x20, x15, x7 - mov x25, #-19 + subs x4, x12, x16 + sbcs x5, x13, x17 + sbcs x6, x14, x19 + sbcs x7, x15, x20 csetm x28, cc - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff + mov x25, #-19 + extr x28, x28, x7, #63 + mul x25, x28, x25 # Add modulus (if underflow) - adds x16, x16, x25 - adcs x17, x17, x28 - adcs x19, x19, x28 - adc x20, x20, x26 + subs x4, x4, x25 + sbcs x5, x5, xzr + and x7, x7, #0x7fffffffffffffff + sbcs x6, x6, xzr + sbc x7, x7, xzr stp x8, x9, [x0] stp x10, x11, [x0, #16] - stp x16, x17, [x1] - stp x19, x20, [x1, #16] - ldr x0, [x29, #48] - ldr x1, [x29, #64] - ldr x2, [x29, #176] + stp x4, x5, [x1] + stp x6, x7, [x1, #16] + ldr x17, [x29, #56] + ldr x19, [x29, #64] + ldp x20, x21, [x29, #72] + ldp x22, x23, [x29, #88] + ldp x24, x25, [x29, #104] + ldp x26, x27, [x29, #120] + ldr x28, [x29, #136] + ldp x29, x30, [sp], #0x90 + ret +#ifndef __APPLE__ + .size ge_msub,.-ge_msub +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl ge_add +.type ge_add,@function +.align 2 +ge_add: +#else +.section __TEXT,__text +.globl _ge_add +.p2align 2 +_ge_add: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-144]! 
+ add x29, sp, #0 + str x17, [x29, #56] + str x19, [x29, #64] + stp x20, x21, [x29, #72] + stp x22, x23, [x29, #88] + stp x24, x25, [x29, #104] + stp x26, x27, [x29, #120] + str x28, [x29, #136] + str x0, [x29, #16] + str x1, [x29, #24] + str x2, [x29, #32] + mov x3, x1 + add x2, x1, #32 + add x1, x0, #32 + # Add + ldp x8, x9, [x2] + ldp x10, x11, [x2, #16] + ldp x4, x5, [x3] + ldp x6, x7, [x3, #16] + adds x16, x8, x4 + adcs x17, x9, x5 + adcs x19, x10, x6 + adcs x20, x11, x7 + cset x28, cs + mov x25, #19 + extr x28, x28, x20, #63 + mul x25, x28, x25 + # Sub modulus (if overflow) + adds x16, x16, x25 + adcs x17, x17, xzr + and x20, x20, #0x7fffffffffffffff + adcs x19, x19, xzr + adc x20, x20, xzr + # Sub + subs x12, x8, x4 + sbcs x13, x9, x5 + sbcs x14, x10, x6 + sbcs x15, x11, x7 + csetm x28, cc + mov x25, #-19 + extr x28, x28, x15, #63 + mul x25, x28, x25 + # Add modulus (if underflow) + subs x12, x12, x25 + sbcs x13, x13, xzr + and x15, x15, #0x7fffffffffffffff + sbcs x14, x14, xzr + sbc x15, x15, xzr + ldr x2, [x29, #32] + mov x1, x0 + # Multiply + ldp x8, x9, [x2] + ldp x10, x11, [x2, #16] + # A[0] * B[0] + umulh x22, x16, x8 + mul x21, x16, x8 + # A[2] * B[0] + umulh x24, x19, x8 + mul x23, x19, x8 + # A[1] * B[0] + mul x25, x17, x8 + adds x22, x22, x25 + umulh x26, x17, x8 + adcs x23, x23, x26 + adc x24, x24, xzr + # A[1] * B[3] + umulh x5, x17, x11 + mul x4, x17, x11 + # A[0] * B[1] + mul x25, x16, x9 + adds x22, x22, x25 + umulh x26, x16, x9 + adcs x23, x23, x26 + # A[2] * B[1] + mul x25, x19, x9 + adcs x24, x24, x25 + umulh x26, x19, x9 + adcs x4, x4, x26 + adc x5, x5, xzr + # A[1] * B[2] + mul x25, x17, x10 + adds x24, x24, x25 + umulh x26, x17, x10 + adcs x4, x4, x26 + adcs x5, x5, xzr + adc x6, xzr, xzr + # A[0] * B[2] + mul x25, x16, x10 + adds x23, x23, x25 + umulh x26, x16, x10 + adcs x24, x24, x26 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # A[1] * B[1] + mul x25, x17, x9 + adds x23, x23, x25 + umulh x26, x17, x9 + adcs x24, x24, x26 + # A[3] * B[1] + mul x25, x20, x9 + adcs x4, x4, x25 + umulh x26, x20, x9 + adcs x5, x5, x26 + adc x6, x6, xzr + # A[2] * B[2] + mul x25, x19, x10 + adds x4, x4, x25 + umulh x26, x19, x10 + adcs x5, x5, x26 + # A[3] * B[3] + mul x25, x20, x11 + adcs x6, x6, x25 + umulh x7, x20, x11 + adc x7, x7, xzr + # A[0] * B[3] + mul x25, x16, x11 + adds x24, x24, x25 + umulh x26, x16, x11 + adcs x4, x4, x26 + # A[2] * B[3] + mul x25, x19, x11 + adcs x5, x5, x25 + umulh x26, x19, x11 + adcs x6, x6, x26 + adc x7, x7, xzr + # A[3] * B[0] + mul x25, x20, x8 + adds x24, x24, x25 + umulh x26, x20, x8 + adcs x4, x4, x26 + # A[3] * B[2] + mul x25, x20, x10 + adcs x5, x5, x25 + umulh x26, x20, x10 + adcs x6, x6, x26 + adc x7, x7, xzr + # Reduce + mov x25, #38 + mul x26, x25, x7 + adds x24, x24, x26 + umulh x27, x25, x7 + adc x27, x27, xzr + mov x25, #19 + extr x27, x27, x24, #63 + mul x27, x27, x25 + and x24, x24, #0x7fffffffffffffff + mov x25, #38 + mul x26, x25, x4 + adds x21, x21, x26 + umulh x4, x25, x4 + mul x26, x25, x5 + adcs x22, x22, x26 + umulh x5, x25, x5 + mul x26, x25, x6 + adcs x23, x23, x26 + umulh x6, x25, x6 + adc x24, x24, xzr + # Add high product results in + adds x21, x21, x27 + adcs x22, x22, x4 + adcs x23, x23, x5 + adc x24, x24, x6 + # Store + stp x21, x22, [x0] + stp x23, x24, [x0, #16] + add x2, x2, #32 + add x1, x0, #32 + add x0, x0, #32 # Multiply - ldp x12, x13, [x1] - ldp x14, x15, [x1, #16] ldp x16, x17, [x2] ldp x19, x20, [x2, #16] - # A[0] * B[0] - mul x4, x12, x16 + # A[0] * B[0] umulh x5, x12, x16 - # A[0] * B[1] - 
mul x25, x12, x17 - umulh x6, x12, x17 - adds x5, x5, x25 - adc x6, x6, xzr - # A[1] * B[0] + mul x4, x12, x16 + # A[2] * B[0] + umulh x7, x14, x16 + mul x6, x14, x16 + # A[1] * B[0] mul x25, x13, x16 + adds x5, x5, x25 umulh x26, x13, x16 + adcs x6, x6, x26 + adc x7, x7, xzr + # A[1] * B[3] + umulh x9, x13, x20 + mul x8, x13, x20 + # A[0] * B[1] + mul x25, x12, x17 adds x5, x5, x25 + umulh x26, x12, x17 adcs x6, x6, x26 - adc x7, xzr, xzr - # A[0] * B[2] - mul x25, x12, x19 - umulh x26, x12, x19 - adds x6, x6, x25 - adc x7, x7, x26 - # A[1] * B[1] - mul x25, x13, x17 - umulh x26, x13, x17 - adds x6, x6, x25 - adcs x7, x7, x26 - adc x8, xzr, xzr - # A[2] * B[0] - mul x25, x14, x16 - umulh x26, x14, x16 - adds x6, x6, x25 - adcs x7, x7, x26 - adc x8, x8, xzr - # A[0] * B[3] - mul x25, x12, x20 - umulh x26, x12, x20 - adds x7, x7, x25 - adcs x8, x8, x26 - adc x9, xzr, xzr - # A[1] * B[2] - mul x25, x13, x19 - umulh x26, x13, x19 - adds x7, x7, x25 - adcs x8, x8, x26 - adc x9, x9, xzr - # A[2] * B[1] + # A[2] * B[1] mul x25, x14, x17 + adcs x7, x7, x25 umulh x26, x14, x17 - adds x7, x7, x25 adcs x8, x8, x26 adc x9, x9, xzr - # A[3] * B[0] - mul x25, x15, x16 - umulh x26, x15, x16 + # A[1] * B[2] + mul x25, x13, x19 adds x7, x7, x25 + umulh x26, x13, x19 adcs x8, x8, x26 - adc x9, x9, xzr - # A[1] * B[3] - mul x25, x13, x20 - umulh x26, x13, x20 - adds x8, x8, x25 - adcs x9, x9, x26 + adcs x9, x9, xzr adc x10, xzr, xzr - # A[2] * B[2] - mul x25, x14, x19 - umulh x26, x14, x19 - adds x8, x8, x25 - adcs x9, x9, x26 + # A[0] * B[2] + mul x25, x12, x19 + adds x6, x6, x25 + umulh x26, x12, x19 + adcs x7, x7, x26 + adcs x8, x8, xzr + adcs x9, x9, xzr adc x10, x10, xzr - # A[3] * B[1] + # A[1] * B[1] + mul x25, x13, x17 + adds x6, x6, x25 + umulh x26, x13, x17 + adcs x7, x7, x26 + # A[3] * B[1] mul x25, x15, x17 + adcs x8, x8, x25 umulh x26, x15, x17 - adds x8, x8, x25 adcs x9, x9, x26 adc x10, x10, xzr - # A[2] * B[3] + # A[2] * B[2] + mul x25, x14, x19 + adds x8, x8, x25 + umulh x26, x14, x19 + adcs x9, x9, x26 + # A[3] * B[3] + mul x25, x15, x20 + adcs x10, x10, x25 + umulh x11, x15, x20 + adc x11, x11, xzr + # A[0] * B[3] + mul x25, x12, x20 + adds x7, x7, x25 + umulh x26, x12, x20 + adcs x8, x8, x26 + # A[2] * B[3] mul x25, x14, x20 + adcs x9, x9, x25 umulh x26, x14, x20 - adds x9, x9, x25 adcs x10, x10, x26 - adc x11, xzr, xzr - # A[3] * B[2] + adc x11, x11, xzr + # A[3] * B[0] + mul x25, x15, x16 + adds x7, x7, x25 + umulh x26, x15, x16 + adcs x8, x8, x26 + # A[3] * B[2] mul x25, x15, x19 + adcs x9, x9, x25 umulh x26, x15, x19 - adds x9, x9, x25 adcs x10, x10, x26 adc x11, x11, xzr - # A[3] * B[3] - mul x25, x15, x20 - umulh x26, x15, x20 - adds x10, x10, x25 - adc x11, x11, x26 # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - and x7, x7, #0x7fffffffffffffff - # Multiply top half by 19 + mov x25, #38 + mul x26, x25, x11 + adds x7, x7, x26 + umulh x27, x25, x11 + adc x27, x27, xzr mov x25, #19 + extr x27, x27, x7, #63 + mul x27, x27, x25 + and x7, x7, #0x7fffffffffffffff + mov x25, #38 mul x26, x25, x8 - umulh x8, x25, x8 adds x4, x4, x26 + umulh x8, x25, x8 mul x26, x25, x9 - umulh x9, x25, x9 adcs x5, x5, x26 + umulh x9, x25, x9 mul x26, x25, x10 - umulh x10, x25, x10 adcs x6, x6, x26 - mul x26, x25, x11 - umulh x27, x25, x11 - adcs x7, x7, x26 - adc x27, x27, xzr - # Add remaining product results in - adds x5, x5, x8 - adcs x6, x6, x9 - adcs x7, x7, x10 - adc x27, x27, xzr - # 
Overflow - extr x27, x27, x7, #63 - mul x27, x27, x25 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x27 - adcs x5, x5, xzr - adcs x6, x6, xzr + umulh x10, x25, x10 adc x7, x7, xzr - # Reduce if top bit set - and x27, x25, x7, asr 63 - and x7, x7, #0x7fffffffffffffff + # Add high product results in adds x4, x4, x27 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x7, x7, x10 # Store - ldr x0, [x29, #48] - # Double - adds x4, x4, x4 - adcs x5, x5, x5 - adcs x6, x6, x6 - adc x7, x7, x7 - mov x25, #-19 - asr x28, x7, #63 - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff - # Sub modulus (if overflow) - subs x4, x4, x25 - sbcs x5, x5, x28 - sbcs x6, x6, x28 - sbc x7, x7, x26 - ldr x0, [x29, #40] - ldr x1, [x29, #184] - ldr x2, [x29, #72] - # Multiply - ldp x16, x17, [x1] - ldp x19, x20, [x1, #16] - ldp x21, x22, [x2] - ldp x23, x24, [x2, #16] - # A[0] * B[0] - mul x8, x16, x21 - umulh x9, x16, x21 - # A[0] * B[1] - mul x25, x16, x22 - umulh x10, x16, x22 - adds x9, x9, x25 - adc x10, x10, xzr - # A[1] * B[0] - mul x25, x17, x21 - umulh x26, x17, x21 - adds x9, x9, x25 - adcs x10, x10, x26 - adc x11, xzr, xzr - # A[0] * B[2] - mul x25, x16, x23 - umulh x26, x16, x23 - adds x10, x10, x25 - adc x11, x11, x26 - # A[1] * B[1] - mul x25, x17, x22 - umulh x26, x17, x22 - adds x10, x10, x25 - adcs x11, x11, x26 - adc x12, xzr, xzr - # A[2] * B[0] - mul x25, x19, x21 - umulh x26, x19, x21 - adds x10, x10, x25 - adcs x11, x11, x26 - adc x12, x12, xzr - # A[0] * B[3] - mul x25, x16, x24 - umulh x26, x16, x24 - adds x11, x11, x25 - adcs x12, x12, x26 - adc x13, xzr, xzr - # A[1] * B[2] - mul x25, x17, x23 - umulh x26, x17, x23 - adds x11, x11, x25 - adcs x12, x12, x26 - adc x13, x13, xzr - # A[2] * B[1] - mul x25, x19, x22 - umulh x26, x19, x22 - adds x11, x11, x25 - adcs x12, x12, x26 - adc x13, x13, xzr - # A[3] * B[0] - mul x25, x20, x21 - umulh x26, x20, x21 - adds x11, x11, x25 - adcs x12, x12, x26 - adc x13, x13, xzr - # A[1] * B[3] - mul x25, x17, x24 - umulh x26, x17, x24 - adds x12, x12, x25 - adcs x13, x13, x26 - adc x14, xzr, xzr - # A[2] * B[2] - mul x25, x19, x23 - umulh x26, x19, x23 - adds x12, x12, x25 - adcs x13, x13, x26 - adc x14, x14, xzr - # A[3] * B[1] - mul x25, x20, x22 - umulh x26, x20, x22 - adds x12, x12, x25 - adcs x13, x13, x26 - adc x14, x14, xzr - # A[2] * B[3] - mul x25, x19, x24 - umulh x26, x19, x24 - adds x13, x13, x25 - adcs x14, x14, x26 - adc x15, xzr, xzr - # A[3] * B[2] - mul x25, x20, x23 - umulh x26, x20, x23 - adds x13, x13, x25 - adcs x14, x14, x26 - adc x15, x15, xzr - # A[3] * B[3] - mul x25, x20, x24 - umulh x26, x20, x24 - adds x14, x14, x25 - adc x15, x15, x26 - # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x15, x15, x14, #63 - extr x14, x14, x13, #63 - extr x13, x13, x12, #63 - extr x12, x12, x11, #63 - and x11, x11, #0x7fffffffffffffff - # Multiply top half by 19 + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + mov x3, x0 + sub x2, x0, #32 + sub x1, x0, #32 + # Add + adds x8, x21, x4 + adcs x9, x22, x5 + adcs x10, x23, x6 + adcs x11, x24, x7 + cset x28, cs mov x25, #19 - mul x26, x25, x12 - umulh x12, x25, x12 - adds x8, x8, x26 - mul x26, x25, x13 - umulh x13, x25, x13 - adcs x9, x9, x26 - mul x26, x25, x14 - umulh x14, x25, x14 - adcs x10, x10, x26 - mul x26, x25, x15 - umulh x27, x25, x15 - adcs x11, x11, x26 - adc x27, x27, xzr - # Add remaining product results in - adds x9, x9, x12 - adcs x10, x10, x13 - adcs x11, x11, x14 - adc x27, x27, xzr - # 
Overflow - extr x27, x27, x11, #63 - mul x27, x27, x25 - and x11, x11, #0x7fffffffffffffff - adds x8, x8, x27 + extr x28, x28, x11, #63 + mul x25, x28, x25 + # Sub modulus (if overflow) + adds x8, x8, x25 adcs x9, x9, xzr - adcs x10, x10, xzr - adc x11, x11, xzr - # Reduce if top bit set - and x27, x25, x11, asr 63 and x11, x11, #0x7fffffffffffffff - adds x8, x8, x27 - adcs x9, x9, xzr adcs x10, x10, xzr adc x11, x11, xzr - # Store - ldr x0, [x29, #32] - ldr x1, [x29, #40] - # Add - adds x12, x4, x8 - adcs x13, x5, x9 - adcs x14, x6, x10 - adc x15, x7, x11 - mov x25, #-19 - asr x28, x15, #63 - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff - # Sub modulus (if overflow) - subs x12, x12, x25 - sbcs x13, x13, x28 - sbcs x14, x14, x28 - sbc x15, x15, x26 # Sub - subs x16, x4, x8 - sbcs x17, x5, x9 - sbcs x19, x6, x10 - sbcs x20, x7, x11 - mov x25, #-19 + subs x12, x21, x4 + sbcs x13, x22, x5 + sbcs x14, x23, x6 + sbcs x15, x24, x7 csetm x28, cc - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff - # Add modulus (if underflow) - adds x16, x16, x25 - adcs x17, x17, x28 - adcs x19, x19, x28 - adc x20, x20, x26 - stp x12, x13, [x0] - stp x14, x15, [x0, #16] - stp x16, x17, [x1] - stp x19, x20, [x1, #16] - ldr x17, [x29, #88] - ldr x19, [x29, #96] - ldp x20, x21, [x29, #104] - ldp x22, x23, [x29, #120] - ldp x24, x25, [x29, #136] - ldp x26, x27, [x29, #152] - ldr x28, [x29, #168] - ldp x29, x30, [sp], #0xb0 - ret -#ifndef __APPLE__ - .size fe_ge_add,.-fe_ge_add -#endif /* __APPLE__ */ -#ifndef __APPLE__ -.text -.globl fe_ge_sub -.type fe_ge_sub,@function -.align 2 -fe_ge_sub: -#else -.section __TEXT,__text -.globl _fe_ge_sub -.p2align 2 -_fe_ge_sub: -#endif /* __APPLE__ */ - stp x29, x30, [sp, #-176]! - add x29, sp, #0 - str x17, [x29, #88] - str x19, [x29, #96] - stp x20, x21, [x29, #104] - stp x22, x23, [x29, #120] - stp x24, x25, [x29, #136] - stp x26, x27, [x29, #152] - str x28, [x29, #168] - str x0, [x29, #16] - str x1, [x29, #24] - str x2, [x29, #32] - str x3, [x29, #40] - str x4, [x29, #48] - str x5, [x29, #56] - str x6, [x29, #64] - str x7, [x29, #72] - ldr x2, [x29, #56] - ldr x3, [x29, #48] - # Add - ldp x12, x13, [x2] - ldp x14, x15, [x2, #16] - ldp x16, x17, [x3] - ldp x19, x20, [x3, #16] - adds x4, x12, x16 - adcs x5, x13, x17 - adcs x6, x14, x19 - adc x7, x15, x20 - mov x25, #-19 - asr x28, x7, #63 - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff - # Sub modulus (if overflow) - subs x4, x4, x25 - sbcs x5, x5, x28 - sbcs x6, x6, x28 - sbc x7, x7, x26 - # Sub - subs x8, x12, x16 - sbcs x9, x13, x17 - sbcs x10, x14, x19 - sbcs x11, x15, x20 mov x25, #-19 - csetm x28, cc - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff + extr x28, x28, x15, #63 + mul x25, x28, x25 # Add modulus (if underflow) - adds x8, x8, x25 - adcs x9, x9, x28 - adcs x10, x10, x28 - adc x11, x11, x26 - ldr x0, [x29, #32] - ldr x2, [x29, #200] + subs x12, x12, x25 + sbcs x13, x13, xzr + and x15, x15, #0x7fffffffffffffff + sbcs x14, x14, xzr + sbc x15, x15, xzr + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + stp x12, x13, [x1] + stp x14, x15, [x1, #16] + ldr x1, [x29, #24] + ldr x2, [x29, #32] + add x2, x2, #0x60 + add x1, x1, #0x60 + add x0, x0, #0x40 # Multiply - ldp x21, x22, [x2] - ldp x23, x24, [x2, #16] - # A[0] * B[0] - mul x12, x4, x21 - umulh x13, x4, x21 - # A[0] * B[1] - mul x25, x4, x22 - umulh x14, x4, x22 - adds x13, x13, x25 - adc x14, x14, xzr - # A[1] * B[0] - mul x25, x5, x21 - umulh x26, x5, x21 
- adds x13, x13, x25 - adcs x14, x14, x26 - adc x15, xzr, xzr - # A[0] * B[2] - mul x25, x4, x23 - umulh x26, x4, x23 - adds x14, x14, x25 - adc x15, x15, x26 - # A[1] * B[1] - mul x25, x5, x22 - umulh x26, x5, x22 - adds x14, x14, x25 - adcs x15, x15, x26 - adc x16, xzr, xzr - # A[2] * B[0] - mul x25, x6, x21 - umulh x26, x6, x21 - adds x14, x14, x25 - adcs x15, x15, x26 - adc x16, x16, xzr - # A[0] * B[3] - mul x25, x4, x24 - umulh x26, x4, x24 - adds x15, x15, x25 - adcs x16, x16, x26 - adc x17, xzr, xzr - # A[1] * B[2] - mul x25, x5, x23 - umulh x26, x5, x23 - adds x15, x15, x25 - adcs x16, x16, x26 - adc x17, x17, xzr - # A[2] * B[1] - mul x25, x6, x22 - umulh x26, x6, x22 - adds x15, x15, x25 - adcs x16, x16, x26 - adc x17, x17, xzr - # A[3] * B[0] - mul x25, x7, x21 - umulh x26, x7, x21 - adds x15, x15, x25 - adcs x16, x16, x26 - adc x17, x17, xzr - # A[1] * B[3] - mul x25, x5, x24 - umulh x26, x5, x24 - adds x16, x16, x25 - adcs x17, x17, x26 - adc x19, xzr, xzr - # A[2] * B[2] - mul x25, x6, x23 - umulh x26, x6, x23 - adds x16, x16, x25 - adcs x17, x17, x26 - adc x19, x19, xzr - # A[3] * B[1] - mul x25, x7, x22 - umulh x26, x7, x22 - adds x16, x16, x25 - adcs x17, x17, x26 - adc x19, x19, xzr - # A[2] * B[3] - mul x25, x6, x24 - umulh x26, x6, x24 + ldp x21, x22, [x1] + ldp x23, x24, [x1, #16] + ldp x4, x5, [x2] + ldp x6, x7, [x2, #16] + # A[0] * B[0] + umulh x17, x21, x4 + mul x16, x21, x4 + # A[2] * B[0] + umulh x20, x23, x4 + mul x19, x23, x4 + # A[1] * B[0] + mul x25, x22, x4 adds x17, x17, x25 + umulh x26, x22, x4 adcs x19, x19, x26 - adc x20, xzr, xzr - # A[3] * B[2] - mul x25, x7, x23 - umulh x26, x7, x23 + adc x20, x20, xzr + # A[1] * B[3] + umulh x9, x22, x7 + mul x8, x22, x7 + # A[0] * B[1] + mul x25, x21, x5 adds x17, x17, x25 + umulh x26, x21, x5 adcs x19, x19, x26 - adc x20, x20, xzr - # A[3] * B[3] - mul x25, x7, x24 - umulh x26, x7, x24 + # A[2] * B[1] + mul x25, x23, x5 + adcs x20, x20, x25 + umulh x26, x23, x5 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[1] * B[2] + mul x25, x22, x6 + adds x20, x20, x25 + umulh x26, x22, x6 + adcs x8, x8, x26 + adcs x9, x9, xzr + adc x10, xzr, xzr + # A[0] * B[2] + mul x25, x21, x6 adds x19, x19, x25 - adc x20, x20, x26 + umulh x26, x21, x6 + adcs x20, x20, x26 + adcs x8, x8, xzr + adcs x9, x9, xzr + adc x10, x10, xzr + # A[1] * B[1] + mul x25, x22, x5 + adds x19, x19, x25 + umulh x26, x22, x5 + adcs x20, x20, x26 + # A[3] * B[1] + mul x25, x24, x5 + adcs x8, x8, x25 + umulh x26, x24, x5 + adcs x9, x9, x26 + adc x10, x10, xzr + # A[2] * B[2] + mul x25, x23, x6 + adds x8, x8, x25 + umulh x26, x23, x6 + adcs x9, x9, x26 + # A[3] * B[3] + mul x25, x24, x7 + adcs x10, x10, x25 + umulh x11, x24, x7 + adc x11, x11, xzr + # A[0] * B[3] + mul x25, x21, x7 + adds x20, x20, x25 + umulh x26, x21, x7 + adcs x8, x8, x26 + # A[2] * B[3] + mul x25, x23, x7 + adcs x9, x9, x25 + umulh x26, x23, x7 + adcs x10, x10, x26 + adc x11, x11, xzr + # A[3] * B[0] + mul x25, x24, x4 + adds x20, x20, x25 + umulh x26, x24, x4 + adcs x8, x8, x26 + # A[3] * B[2] + mul x25, x24, x6 + adcs x9, x9, x25 + umulh x26, x24, x6 + adcs x10, x10, x26 + adc x11, x11, xzr # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x20, x20, x19, #63 - extr x19, x19, x17, #63 - extr x17, x17, x16, #63 - extr x16, x16, x15, #63 - and x15, x15, #0x7fffffffffffffff - # Multiply top half by 19 - mov x25, #19 - mul x26, x25, x16 - umulh x16, x25, x16 - adds x12, x12, x26 - mul x26, x25, x17 - umulh x17, x25, x17 - adcs x13, x13, x26 - mul x26, x25, x19 - umulh x19, x25, 
x19 - adcs x14, x14, x26 - mul x26, x25, x20 - umulh x27, x25, x20 - adcs x15, x15, x26 - adc x27, x27, xzr - # Add remaining product results in - adds x13, x13, x16 - adcs x14, x14, x17 - adcs x15, x15, x19 + mov x25, #38 + mul x26, x25, x11 + adds x20, x20, x26 + umulh x27, x25, x11 adc x27, x27, xzr - # Overflow - extr x27, x27, x15, #63 + mov x25, #19 + extr x27, x27, x20, #63 mul x27, x27, x25 - and x15, x15, #0x7fffffffffffffff - adds x12, x12, x27 - adcs x13, x13, xzr - adcs x14, x14, xzr - adc x15, x15, xzr - # Reduce if top bit set - and x27, x25, x15, asr 63 - and x15, x15, #0x7fffffffffffffff - adds x12, x12, x27 - adcs x13, x13, xzr - adcs x14, x14, xzr - adc x15, x15, xzr + and x20, x20, #0x7fffffffffffffff + mov x25, #38 + mul x26, x25, x8 + adds x16, x16, x26 + umulh x8, x25, x8 + mul x26, x25, x9 + adcs x17, x17, x26 + umulh x9, x25, x9 + mul x26, x25, x10 + adcs x19, x19, x26 + umulh x10, x25, x10 + adc x20, x20, xzr + # Add high product results in + adds x16, x16, x27 + adcs x17, x17, x8 + adcs x19, x19, x9 + adc x20, x20, x10 # Store - ldr x0, [x29, #24] - ldr x1, [x29, #192] + stp x16, x17, [x0] + stp x19, x20, [x0, #16] + sub x3, x2, #32 + sub x2, x1, #32 + sub x1, x0, #32 # Multiply - ldp x21, x22, [x1] - ldp x23, x24, [x1, #16] - # A[0] * B[0] - mul x4, x8, x21 - umulh x5, x8, x21 - # A[0] * B[1] - mul x25, x8, x22 - umulh x6, x8, x22 - adds x5, x5, x25 - adc x6, x6, xzr - # A[1] * B[0] - mul x25, x9, x21 - umulh x26, x9, x21 - adds x5, x5, x25 - adcs x6, x6, x26 - adc x7, xzr, xzr - # A[0] * B[2] - mul x25, x8, x23 - umulh x26, x8, x23 - adds x6, x6, x25 - adc x7, x7, x26 - # A[1] * B[1] - mul x25, x9, x22 - umulh x26, x9, x22 - adds x6, x6, x25 - adcs x7, x7, x26 - adc x16, xzr, xzr - # A[2] * B[0] - mul x25, x10, x21 - umulh x26, x10, x21 - adds x6, x6, x25 - adcs x7, x7, x26 - adc x16, x16, xzr - # A[0] * B[3] - mul x25, x8, x24 - umulh x26, x8, x24 - adds x7, x7, x25 - adcs x16, x16, x26 - adc x17, xzr, xzr - # A[1] * B[2] - mul x25, x9, x23 - umulh x26, x9, x23 - adds x7, x7, x25 - adcs x16, x16, x26 - adc x17, x17, xzr - # A[2] * B[1] - mul x25, x10, x22 - umulh x26, x10, x22 - adds x7, x7, x25 + ldp x4, x5, [x2] + ldp x6, x7, [x2, #16] + ldp x12, x13, [x3] + ldp x14, x15, [x3, #16] + # A[0] * B[0] + umulh x9, x4, x12 + mul x8, x4, x12 + # A[2] * B[0] + umulh x11, x6, x12 + mul x10, x6, x12 + # A[1] * B[0] + mul x25, x5, x12 + adds x9, x9, x25 + umulh x26, x5, x12 + adcs x10, x10, x26 + adc x11, x11, xzr + # A[1] * B[3] + umulh x17, x5, x15 + mul x16, x5, x15 + # A[0] * B[1] + mul x25, x4, x13 + adds x9, x9, x25 + umulh x26, x4, x13 + adcs x10, x10, x26 + # A[2] * B[1] + mul x25, x6, x13 + adcs x11, x11, x25 + umulh x26, x6, x13 adcs x16, x16, x26 adc x17, x17, xzr - # A[3] * B[0] - mul x25, x11, x21 - umulh x26, x11, x21 - adds x7, x7, x25 + # A[1] * B[2] + mul x25, x5, x14 + adds x11, x11, x25 + umulh x26, x5, x14 adcs x16, x16, x26 - adc x17, x17, xzr - # A[1] * B[3] - mul x25, x9, x24 - umulh x26, x9, x24 - adds x16, x16, x25 - adcs x17, x17, x26 + adcs x17, x17, xzr adc x19, xzr, xzr - # A[2] * B[2] - mul x25, x10, x23 - umulh x26, x10, x23 - adds x16, x16, x25 + # A[0] * B[2] + mul x25, x4, x14 + adds x10, x10, x25 + umulh x26, x4, x14 + adcs x11, x11, x26 + adcs x16, x16, xzr + adcs x17, x17, xzr + adc x19, x19, xzr + # A[1] * B[1] + mul x25, x5, x13 + adds x10, x10, x25 + umulh x26, x5, x13 + adcs x11, x11, x26 + # A[3] * B[1] + mul x25, x7, x13 + adcs x16, x16, x25 + umulh x26, x7, x13 adcs x17, x17, x26 adc x19, x19, xzr - # A[3] * B[1] - mul x25, 
x11, x22 - umulh x26, x11, x22 + # A[2] * B[2] + mul x25, x6, x14 adds x16, x16, x25 + umulh x26, x6, x14 adcs x17, x17, x26 - adc x19, x19, xzr - # A[2] * B[3] - mul x25, x10, x24 - umulh x26, x10, x24 - adds x17, x17, x25 + # A[3] * B[3] + mul x25, x7, x15 + adcs x19, x19, x25 + umulh x20, x7, x15 + adc x20, x20, xzr + # A[0] * B[3] + mul x25, x4, x15 + adds x11, x11, x25 + umulh x26, x4, x15 + adcs x16, x16, x26 + # A[2] * B[3] + mul x25, x6, x15 + adcs x17, x17, x25 + umulh x26, x6, x15 adcs x19, x19, x26 - adc x20, xzr, xzr - # A[3] * B[2] - mul x25, x11, x23 - umulh x26, x11, x23 - adds x17, x17, x25 + adc x20, x20, xzr + # A[3] * B[0] + mul x25, x7, x12 + adds x11, x11, x25 + umulh x26, x7, x12 + adcs x16, x16, x26 + # A[3] * B[2] + mul x25, x7, x14 + adcs x17, x17, x25 + umulh x26, x7, x14 adcs x19, x19, x26 adc x20, x20, xzr - # A[3] * B[3] - mul x25, x11, x24 - umulh x26, x11, x24 - adds x19, x19, x25 - adc x20, x20, x26 # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x20, x20, x19, #63 - extr x19, x19, x17, #63 - extr x17, x17, x16, #63 - extr x16, x16, x7, #63 - and x7, x7, #0x7fffffffffffffff - # Multiply top half by 19 + mov x25, #38 + mul x26, x25, x20 + adds x11, x11, x26 + umulh x27, x25, x20 + adc x27, x27, xzr mov x25, #19 + extr x27, x27, x11, #63 + mul x27, x27, x25 + and x11, x11, #0x7fffffffffffffff + mov x25, #38 mul x26, x25, x16 + adds x8, x8, x26 umulh x16, x25, x16 - adds x4, x4, x26 mul x26, x25, x17 + adcs x9, x9, x26 umulh x17, x25, x17 - adcs x5, x5, x26 mul x26, x25, x19 + adcs x10, x10, x26 umulh x19, x25, x19 - adcs x6, x6, x26 - mul x26, x25, x20 - umulh x27, x25, x20 - adcs x7, x7, x26 - adc x27, x27, xzr - # Add remaining product results in - adds x5, x5, x16 - adcs x6, x6, x17 - adcs x7, x7, x19 - adc x27, x27, xzr - # Overflow - extr x27, x27, x7, #63 - mul x27, x27, x25 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x27 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr - # Reduce if top bit set - and x27, x25, x7, asr 63 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x27 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr - # Store - ldr x0, [x29, #24] - ldr x1, [x29, #16] - # Add - adds x8, x12, x4 - adcs x9, x13, x5 - adcs x10, x14, x6 - adc x11, x15, x7 + adc x11, x11, xzr + # Add high product results in + adds x8, x8, x27 + adcs x9, x9, x16 + adcs x10, x10, x17 + adc x11, x11, x19 + # Double + adds x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adc x11, x11, x11 mov x25, #-19 asr x28, x11, #63 # Mask the modulus @@ -7118,375 +7623,1225 @@ _fe_ge_sub: sbcs x9, x9, x28 sbcs x10, x10, x28 sbc x11, x11, x26 + mov x3, x0 + sub x2, x0, #32 + mov x1, x0 + sub x0, x0, #32 + # Add + ldp x4, x5, [x3] + ldp x6, x7, [x3, #16] + adds x21, x8, x4 + adcs x22, x9, x5 + adcs x23, x10, x6 + adcs x24, x11, x7 + cset x28, cs + mov x25, #19 + extr x28, x28, x24, #63 + mul x25, x28, x25 + # Sub modulus (if overflow) + adds x21, x21, x25 + adcs x22, x22, xzr + and x24, x24, #0x7fffffffffffffff + adcs x23, x23, xzr + adc x24, x24, xzr # Sub - subs x16, x12, x4 - sbcs x17, x13, x5 - sbcs x19, x14, x6 - sbcs x20, x15, x7 - mov x25, #-19 + subs x12, x8, x4 + sbcs x13, x9, x5 + sbcs x14, x10, x6 + sbcs x15, x11, x7 csetm x28, cc - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff + mov x25, #-19 + extr x28, x28, x15, #63 + mul x25, x28, x25 # Add modulus (if underflow) + subs x12, x12, x25 + sbcs x13, x13, xzr + and x15, x15, #0x7fffffffffffffff + sbcs x14, x14, xzr + sbc x15, x15, xzr + stp x21, x22, [x0] 
+ stp x23, x24, [x0, #16] + stp x12, x13, [x1] + stp x14, x15, [x1, #16] + ldr x17, [x29, #56] + ldr x19, [x29, #64] + ldp x20, x21, [x29, #72] + ldp x22, x23, [x29, #88] + ldp x24, x25, [x29, #104] + ldp x26, x27, [x29, #120] + ldr x28, [x29, #136] + ldp x29, x30, [sp], #0x90 + ret +#ifndef __APPLE__ + .size ge_add,.-ge_add +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl ge_sub +.type ge_sub,@function +.align 2 +ge_sub: +#else +.section __TEXT,__text +.globl _ge_sub +.p2align 2 +_ge_sub: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-144]! + add x29, sp, #0 + str x17, [x29, #56] + str x19, [x29, #64] + stp x20, x21, [x29, #72] + stp x22, x23, [x29, #88] + stp x24, x25, [x29, #104] + stp x26, x27, [x29, #120] + str x28, [x29, #136] + str x0, [x29, #16] + str x1, [x29, #24] + str x2, [x29, #32] + mov x3, x1 + add x2, x1, #32 + add x1, x0, #32 + # Add + ldp x8, x9, [x2] + ldp x10, x11, [x2, #16] + ldp x4, x5, [x3] + ldp x6, x7, [x3, #16] + adds x16, x8, x4 + adcs x17, x9, x5 + adcs x19, x10, x6 + adcs x20, x11, x7 + cset x28, cs + mov x25, #19 + extr x28, x28, x20, #63 + mul x25, x28, x25 + # Sub modulus (if overflow) adds x16, x16, x25 - adcs x17, x17, x28 - adcs x19, x19, x28 - adc x20, x20, x26 - stp x8, x9, [x0] - stp x10, x11, [x0, #16] - stp x16, x17, [x1] - stp x19, x20, [x1, #16] - ldr x0, [x29, #48] - ldr x1, [x29, #64] - ldr x2, [x29, #176] + adcs x17, x17, xzr + and x20, x20, #0x7fffffffffffffff + adcs x19, x19, xzr + adc x20, x20, xzr + # Sub + subs x12, x8, x4 + sbcs x13, x9, x5 + sbcs x14, x10, x6 + sbcs x15, x11, x7 + csetm x28, cc + mov x25, #-19 + extr x28, x28, x15, #63 + mul x25, x28, x25 + # Add modulus (if underflow) + subs x12, x12, x25 + sbcs x13, x13, xzr + and x15, x15, #0x7fffffffffffffff + sbcs x14, x14, xzr + sbc x15, x15, xzr + ldr x2, [x29, #32] + add x2, x2, #32 + mov x1, x0 + # Multiply + ldp x8, x9, [x2] + ldp x10, x11, [x2, #16] + # A[0] * B[0] + umulh x22, x16, x8 + mul x21, x16, x8 + # A[2] * B[0] + umulh x24, x19, x8 + mul x23, x19, x8 + # A[1] * B[0] + mul x25, x17, x8 + adds x22, x22, x25 + umulh x26, x17, x8 + adcs x23, x23, x26 + adc x24, x24, xzr + # A[1] * B[3] + umulh x5, x17, x11 + mul x4, x17, x11 + # A[0] * B[1] + mul x25, x16, x9 + adds x22, x22, x25 + umulh x26, x16, x9 + adcs x23, x23, x26 + # A[2] * B[1] + mul x25, x19, x9 + adcs x24, x24, x25 + umulh x26, x19, x9 + adcs x4, x4, x26 + adc x5, x5, xzr + # A[1] * B[2] + mul x25, x17, x10 + adds x24, x24, x25 + umulh x26, x17, x10 + adcs x4, x4, x26 + adcs x5, x5, xzr + adc x6, xzr, xzr + # A[0] * B[2] + mul x25, x16, x10 + adds x23, x23, x25 + umulh x26, x16, x10 + adcs x24, x24, x26 + adcs x4, x4, xzr + adcs x5, x5, xzr + adc x6, x6, xzr + # A[1] * B[1] + mul x25, x17, x9 + adds x23, x23, x25 + umulh x26, x17, x9 + adcs x24, x24, x26 + # A[3] * B[1] + mul x25, x20, x9 + adcs x4, x4, x25 + umulh x26, x20, x9 + adcs x5, x5, x26 + adc x6, x6, xzr + # A[2] * B[2] + mul x25, x19, x10 + adds x4, x4, x25 + umulh x26, x19, x10 + adcs x5, x5, x26 + # A[3] * B[3] + mul x25, x20, x11 + adcs x6, x6, x25 + umulh x7, x20, x11 + adc x7, x7, xzr + # A[0] * B[3] + mul x25, x16, x11 + adds x24, x24, x25 + umulh x26, x16, x11 + adcs x4, x4, x26 + # A[2] * B[3] + mul x25, x19, x11 + adcs x5, x5, x25 + umulh x26, x19, x11 + adcs x6, x6, x26 + adc x7, x7, xzr + # A[3] * B[0] + mul x25, x20, x8 + adds x24, x24, x25 + umulh x26, x20, x8 + adcs x4, x4, x26 + # A[3] * B[2] + mul x25, x20, x10 + adcs x5, x5, x25 + umulh x26, x20, x10 + adcs x6, x6, x26 + adc x7, x7, xzr + # Reduce + mov x25, #38 + mul x26, 
x25, x7 + adds x24, x24, x26 + umulh x27, x25, x7 + adc x27, x27, xzr + mov x25, #19 + extr x27, x27, x24, #63 + mul x27, x27, x25 + and x24, x24, #0x7fffffffffffffff + mov x25, #38 + mul x26, x25, x4 + adds x21, x21, x26 + umulh x4, x25, x4 + mul x26, x25, x5 + adcs x22, x22, x26 + umulh x5, x25, x5 + mul x26, x25, x6 + adcs x23, x23, x26 + umulh x6, x25, x6 + adc x24, x24, xzr + # Add high product results in + adds x21, x21, x27 + adcs x22, x22, x4 + adcs x23, x23, x5 + adc x24, x24, x6 + # Reduce if top bit set + mov x25, #19 + and x26, x25, x24, asr 63 + adds x21, x21, x26 + adcs x22, x22, xzr + and x24, x24, #0x7fffffffffffffff + adcs x23, x23, xzr + adc x24, x24, xzr + # Store + stp x21, x22, [x0] + stp x23, x24, [x0, #16] + sub x2, x2, #32 + add x1, x0, #32 + add x0, x0, #32 # Multiply - ldp x12, x13, [x1] - ldp x14, x15, [x1, #16] ldp x16, x17, [x2] ldp x19, x20, [x2, #16] - # A[0] * B[0] - mul x4, x12, x16 + # A[0] * B[0] umulh x5, x12, x16 - # A[0] * B[1] - mul x25, x12, x17 - umulh x6, x12, x17 - adds x5, x5, x25 - adc x6, x6, xzr - # A[1] * B[0] + mul x4, x12, x16 + # A[2] * B[0] + umulh x7, x14, x16 + mul x6, x14, x16 + # A[1] * B[0] mul x25, x13, x16 + adds x5, x5, x25 umulh x26, x13, x16 + adcs x6, x6, x26 + adc x7, x7, xzr + # A[1] * B[3] + umulh x9, x13, x20 + mul x8, x13, x20 + # A[0] * B[1] + mul x25, x12, x17 adds x5, x5, x25 + umulh x26, x12, x17 adcs x6, x6, x26 - adc x7, xzr, xzr - # A[0] * B[2] - mul x25, x12, x19 - umulh x26, x12, x19 - adds x6, x6, x25 - adc x7, x7, x26 - # A[1] * B[1] - mul x25, x13, x17 - umulh x26, x13, x17 - adds x6, x6, x25 - adcs x7, x7, x26 - adc x8, xzr, xzr - # A[2] * B[0] - mul x25, x14, x16 - umulh x26, x14, x16 - adds x6, x6, x25 - adcs x7, x7, x26 - adc x8, x8, xzr - # A[0] * B[3] - mul x25, x12, x20 - umulh x26, x12, x20 - adds x7, x7, x25 - adcs x8, x8, x26 - adc x9, xzr, xzr - # A[1] * B[2] - mul x25, x13, x19 - umulh x26, x13, x19 - adds x7, x7, x25 - adcs x8, x8, x26 - adc x9, x9, xzr - # A[2] * B[1] + # A[2] * B[1] mul x25, x14, x17 + adcs x7, x7, x25 umulh x26, x14, x17 - adds x7, x7, x25 adcs x8, x8, x26 adc x9, x9, xzr - # A[3] * B[0] - mul x25, x15, x16 - umulh x26, x15, x16 + # A[1] * B[2] + mul x25, x13, x19 adds x7, x7, x25 + umulh x26, x13, x19 adcs x8, x8, x26 - adc x9, x9, xzr - # A[1] * B[3] - mul x25, x13, x20 - umulh x26, x13, x20 - adds x8, x8, x25 - adcs x9, x9, x26 + adcs x9, x9, xzr adc x10, xzr, xzr - # A[2] * B[2] - mul x25, x14, x19 - umulh x26, x14, x19 - adds x8, x8, x25 - adcs x9, x9, x26 + # A[0] * B[2] + mul x25, x12, x19 + adds x6, x6, x25 + umulh x26, x12, x19 + adcs x7, x7, x26 + adcs x8, x8, xzr + adcs x9, x9, xzr adc x10, x10, xzr - # A[3] * B[1] + # A[1] * B[1] + mul x25, x13, x17 + adds x6, x6, x25 + umulh x26, x13, x17 + adcs x7, x7, x26 + # A[3] * B[1] mul x25, x15, x17 + adcs x8, x8, x25 umulh x26, x15, x17 - adds x8, x8, x25 adcs x9, x9, x26 adc x10, x10, xzr - # A[2] * B[3] + # A[2] * B[2] + mul x25, x14, x19 + adds x8, x8, x25 + umulh x26, x14, x19 + adcs x9, x9, x26 + # A[3] * B[3] + mul x25, x15, x20 + adcs x10, x10, x25 + umulh x11, x15, x20 + adc x11, x11, xzr + # A[0] * B[3] + mul x25, x12, x20 + adds x7, x7, x25 + umulh x26, x12, x20 + adcs x8, x8, x26 + # A[2] * B[3] mul x25, x14, x20 + adcs x9, x9, x25 umulh x26, x14, x20 - adds x9, x9, x25 adcs x10, x10, x26 - adc x11, xzr, xzr - # A[3] * B[2] + adc x11, x11, xzr + # A[3] * B[0] + mul x25, x15, x16 + adds x7, x7, x25 + umulh x26, x15, x16 + adcs x8, x8, x26 + # A[3] * B[2] mul x25, x15, x19 + adcs x9, x9, x25 umulh x26, x15, x19 - 
adds x9, x9, x25 adcs x10, x10, x26 adc x11, x11, xzr - # A[3] * B[3] - mul x25, x15, x20 - umulh x26, x15, x20 - adds x10, x10, x25 - adc x11, x11, x26 # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x11, x11, x10, #63 - extr x10, x10, x9, #63 - extr x9, x9, x8, #63 - extr x8, x8, x7, #63 - and x7, x7, #0x7fffffffffffffff - # Multiply top half by 19 + mov x25, #38 + mul x26, x25, x11 + adds x7, x7, x26 + umulh x27, x25, x11 + adc x27, x27, xzr mov x25, #19 + extr x27, x27, x7, #63 + mul x27, x27, x25 + and x7, x7, #0x7fffffffffffffff + mov x25, #38 mul x26, x25, x8 - umulh x8, x25, x8 adds x4, x4, x26 + umulh x8, x25, x8 mul x26, x25, x9 - umulh x9, x25, x9 adcs x5, x5, x26 + umulh x9, x25, x9 mul x26, x25, x10 - umulh x10, x25, x10 adcs x6, x6, x26 + umulh x10, x25, x10 + adc x7, x7, xzr + # Add high product results in + adds x4, x4, x27 + adcs x5, x5, x8 + adcs x6, x6, x9 + adc x7, x7, x10 + # Store + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + mov x3, x0 + sub x2, x0, #32 + sub x1, x0, #32 + # Add + adds x8, x21, x4 + adcs x9, x22, x5 + adcs x10, x23, x6 + adcs x11, x24, x7 + cset x28, cs + mov x25, #19 + extr x28, x28, x11, #63 + mul x25, x28, x25 + # Sub modulus (if overflow) + adds x8, x8, x25 + adcs x9, x9, xzr + and x11, x11, #0x7fffffffffffffff + adcs x10, x10, xzr + adc x11, x11, xzr + # Sub + subs x12, x21, x4 + sbcs x13, x22, x5 + sbcs x14, x23, x6 + sbcs x15, x24, x7 + csetm x28, cc + mov x25, #-19 + extr x28, x28, x15, #63 + mul x25, x28, x25 + # Add modulus (if underflow) + subs x12, x12, x25 + sbcs x13, x13, xzr + and x15, x15, #0x7fffffffffffffff + sbcs x14, x14, xzr + sbc x15, x15, xzr + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + stp x12, x13, [x1] + stp x14, x15, [x1, #16] + ldr x1, [x29, #24] + ldr x2, [x29, #32] + add x2, x2, #0x60 + add x1, x1, #0x60 + add x0, x0, #0x40 + # Multiply + ldp x21, x22, [x1] + ldp x23, x24, [x1, #16] + ldp x4, x5, [x2] + ldp x6, x7, [x2, #16] + # A[0] * B[0] + umulh x17, x21, x4 + mul x16, x21, x4 + # A[2] * B[0] + umulh x20, x23, x4 + mul x19, x23, x4 + # A[1] * B[0] + mul x25, x22, x4 + adds x17, x17, x25 + umulh x26, x22, x4 + adcs x19, x19, x26 + adc x20, x20, xzr + # A[1] * B[3] + umulh x9, x22, x7 + mul x8, x22, x7 + # A[0] * B[1] + mul x25, x21, x5 + adds x17, x17, x25 + umulh x26, x21, x5 + adcs x19, x19, x26 + # A[2] * B[1] + mul x25, x23, x5 + adcs x20, x20, x25 + umulh x26, x23, x5 + adcs x8, x8, x26 + adc x9, x9, xzr + # A[1] * B[2] + mul x25, x22, x6 + adds x20, x20, x25 + umulh x26, x22, x6 + adcs x8, x8, x26 + adcs x9, x9, xzr + adc x10, xzr, xzr + # A[0] * B[2] + mul x25, x21, x6 + adds x19, x19, x25 + umulh x26, x21, x6 + adcs x20, x20, x26 + adcs x8, x8, xzr + adcs x9, x9, xzr + adc x10, x10, xzr + # A[1] * B[1] + mul x25, x22, x5 + adds x19, x19, x25 + umulh x26, x22, x5 + adcs x20, x20, x26 + # A[3] * B[1] + mul x25, x24, x5 + adcs x8, x8, x25 + umulh x26, x24, x5 + adcs x9, x9, x26 + adc x10, x10, xzr + # A[2] * B[2] + mul x25, x23, x6 + adds x8, x8, x25 + umulh x26, x23, x6 + adcs x9, x9, x26 + # A[3] * B[3] + mul x25, x24, x7 + adcs x10, x10, x25 + umulh x11, x24, x7 + adc x11, x11, xzr + # A[0] * B[3] + mul x25, x21, x7 + adds x20, x20, x25 + umulh x26, x21, x7 + adcs x8, x8, x26 + # A[2] * B[3] + mul x25, x23, x7 + adcs x9, x9, x25 + umulh x26, x23, x7 + adcs x10, x10, x26 + adc x11, x11, xzr + # A[3] * B[0] + mul x25, x24, x4 + adds x20, x20, x25 + umulh x26, x24, x4 + adcs x8, x8, x26 + # A[3] * B[2] + mul x25, x24, x6 + adcs x9, x9, x25 + umulh x26, x24, x6 + adcs x10, x10, x26 + adc 
x11, x11, xzr + # Reduce + mov x25, #38 mul x26, x25, x11 + adds x20, x20, x26 umulh x27, x25, x11 - adcs x7, x7, x26 - adc x27, x27, xzr - # Add remaining product results in - adds x5, x5, x8 - adcs x6, x6, x9 - adcs x7, x7, x10 adc x27, x27, xzr - # Overflow - extr x27, x27, x7, #63 + mov x25, #19 + extr x27, x27, x20, #63 mul x27, x27, x25 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x27 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr + and x20, x20, #0x7fffffffffffffff + mov x25, #38 + mul x26, x25, x8 + adds x16, x16, x26 + umulh x8, x25, x8 + mul x26, x25, x9 + adcs x17, x17, x26 + umulh x9, x25, x9 + mul x26, x25, x10 + adcs x19, x19, x26 + umulh x10, x25, x10 + adc x20, x20, xzr + # Add high product results in + adds x16, x16, x27 + adcs x17, x17, x8 + adcs x19, x19, x9 + adc x20, x20, x10 # Reduce if top bit set - and x27, x25, x7, asr 63 - and x7, x7, #0x7fffffffffffffff - adds x4, x4, x27 - adcs x5, x5, xzr - adcs x6, x6, xzr - adc x7, x7, xzr + mov x25, #19 + and x26, x25, x20, asr 63 + adds x16, x16, x26 + adcs x17, x17, xzr + and x20, x20, #0x7fffffffffffffff + adcs x19, x19, xzr + adc x20, x20, xzr # Store - ldr x0, [x29, #48] - # Double - adds x4, x4, x4 - adcs x5, x5, x5 - adcs x6, x6, x6 - adc x7, x7, x7 - mov x25, #-19 - asr x28, x7, #63 - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff - # Sub modulus (if overflow) - subs x4, x4, x25 - sbcs x5, x5, x28 - sbcs x6, x6, x28 - sbc x7, x7, x26 - ldr x0, [x29, #40] - ldr x1, [x29, #184] - ldr x2, [x29, #72] + stp x16, x17, [x0] + stp x19, x20, [x0, #16] + sub x3, x2, #32 + sub x2, x1, #32 + sub x1, x0, #32 # Multiply - ldp x16, x17, [x1] - ldp x19, x20, [x1, #16] - ldp x21, x22, [x2] - ldp x23, x24, [x2, #16] - # A[0] * B[0] - mul x8, x16, x21 - umulh x9, x16, x21 - # A[0] * B[1] - mul x25, x16, x22 - umulh x10, x16, x22 + ldp x4, x5, [x2] + ldp x6, x7, [x2, #16] + ldp x12, x13, [x3] + ldp x14, x15, [x3, #16] + # A[0] * B[0] + umulh x9, x4, x12 + mul x8, x4, x12 + # A[2] * B[0] + umulh x11, x6, x12 + mul x10, x6, x12 + # A[1] * B[0] + mul x25, x5, x12 adds x9, x9, x25 - adc x10, x10, xzr - # A[1] * B[0] - mul x25, x17, x21 - umulh x26, x17, x21 + umulh x26, x5, x12 + adcs x10, x10, x26 + adc x11, x11, xzr + # A[1] * B[3] + umulh x17, x5, x15 + mul x16, x5, x15 + # A[0] * B[1] + mul x25, x4, x13 adds x9, x9, x25 + umulh x26, x4, x13 adcs x10, x10, x26 - adc x11, xzr, xzr - # A[0] * B[2] - mul x25, x16, x23 - umulh x26, x16, x23 - adds x10, x10, x25 - adc x11, x11, x26 - # A[1] * B[1] - mul x25, x17, x22 - umulh x26, x17, x22 + # A[2] * B[1] + mul x25, x6, x13 + adcs x11, x11, x25 + umulh x26, x6, x13 + adcs x16, x16, x26 + adc x17, x17, xzr + # A[1] * B[2] + mul x25, x5, x14 + adds x11, x11, x25 + umulh x26, x5, x14 + adcs x16, x16, x26 + adcs x17, x17, xzr + adc x19, xzr, xzr + # A[0] * B[2] + mul x25, x4, x14 adds x10, x10, x25 + umulh x26, x4, x14 adcs x11, x11, x26 - adc x12, xzr, xzr - # A[2] * B[0] - mul x25, x19, x21 - umulh x26, x19, x21 + adcs x16, x16, xzr + adcs x17, x17, xzr + adc x19, x19, xzr + # A[1] * B[1] + mul x25, x5, x13 adds x10, x10, x25 + umulh x26, x5, x13 adcs x11, x11, x26 - adc x12, x12, xzr - # A[0] * B[3] - mul x25, x16, x24 - umulh x26, x16, x24 - adds x11, x11, x25 - adcs x12, x12, x26 - adc x13, xzr, xzr - # A[1] * B[2] - mul x25, x17, x23 - umulh x26, x17, x23 - adds x11, x11, x25 - adcs x12, x12, x26 - adc x13, x13, xzr - # A[2] * B[1] - mul x25, x19, x22 - umulh x26, x19, x22 + # A[3] * B[1] + mul x25, x7, x13 + adcs x16, x16, x25 + umulh x26, x7, x13 + 
adcs x17, x17, x26 + adc x19, x19, xzr + # A[2] * B[2] + mul x25, x6, x14 + adds x16, x16, x25 + umulh x26, x6, x14 + adcs x17, x17, x26 + # A[3] * B[3] + mul x25, x7, x15 + adcs x19, x19, x25 + umulh x20, x7, x15 + adc x20, x20, xzr + # A[0] * B[3] + mul x25, x4, x15 adds x11, x11, x25 - adcs x12, x12, x26 - adc x13, x13, xzr - # A[3] * B[0] - mul x25, x20, x21 - umulh x26, x20, x21 + umulh x26, x4, x15 + adcs x16, x16, x26 + # A[2] * B[3] + mul x25, x6, x15 + adcs x17, x17, x25 + umulh x26, x6, x15 + adcs x19, x19, x26 + adc x20, x20, xzr + # A[3] * B[0] + mul x25, x7, x12 adds x11, x11, x25 - adcs x12, x12, x26 - adc x13, x13, xzr - # A[1] * B[3] - mul x25, x17, x24 - umulh x26, x17, x24 - adds x12, x12, x25 - adcs x13, x13, x26 - adc x14, xzr, xzr - # A[2] * B[2] - mul x25, x19, x23 - umulh x26, x19, x23 - adds x12, x12, x25 - adcs x13, x13, x26 - adc x14, x14, xzr - # A[3] * B[1] - mul x25, x20, x22 - umulh x26, x20, x22 - adds x12, x12, x25 - adcs x13, x13, x26 - adc x14, x14, xzr - # A[2] * B[3] - mul x25, x19, x24 - umulh x26, x19, x24 - adds x13, x13, x25 - adcs x14, x14, x26 - adc x15, xzr, xzr - # A[3] * B[2] - mul x25, x20, x23 - umulh x26, x20, x23 - adds x13, x13, x25 - adcs x14, x14, x26 - adc x15, x15, xzr - # A[3] * B[3] - mul x25, x20, x24 - umulh x26, x20, x24 - adds x14, x14, x25 - adc x15, x15, x26 + umulh x26, x7, x12 + adcs x16, x16, x26 + # A[3] * B[2] + mul x25, x7, x14 + adcs x17, x17, x25 + umulh x26, x7, x14 + adcs x19, x19, x26 + adc x20, x20, xzr # Reduce - # Move top half into t4-t7 and remove top bit from t3 - extr x15, x15, x14, #63 - extr x14, x14, x13, #63 - extr x13, x13, x12, #63 - extr x12, x12, x11, #63 - and x11, x11, #0x7fffffffffffffff - # Multiply top half by 19 - mov x25, #19 - mul x26, x25, x12 - umulh x12, x25, x12 - adds x8, x8, x26 - mul x26, x25, x13 - umulh x13, x25, x13 - adcs x9, x9, x26 - mul x26, x25, x14 - umulh x14, x25, x14 - adcs x10, x10, x26 - mul x26, x25, x15 - umulh x27, x25, x15 - adcs x11, x11, x26 - adc x27, x27, xzr - # Add remaining product results in - adds x9, x9, x12 - adcs x10, x10, x13 - adcs x11, x11, x14 + mov x25, #38 + mul x26, x25, x20 + adds x11, x11, x26 + umulh x27, x25, x20 adc x27, x27, xzr - # Overflow + mov x25, #19 extr x27, x27, x11, #63 mul x27, x27, x25 and x11, x11, #0x7fffffffffffffff - adds x8, x8, x27 - adcs x9, x9, xzr - adcs x10, x10, xzr + mov x25, #38 + mul x26, x25, x16 + adds x8, x8, x26 + umulh x16, x25, x16 + mul x26, x25, x17 + adcs x9, x9, x26 + umulh x17, x25, x17 + mul x26, x25, x19 + adcs x10, x10, x26 + umulh x19, x25, x19 adc x11, x11, xzr - # Reduce if top bit set - and x27, x25, x11, asr 63 - and x11, x11, #0x7fffffffffffffff + # Add high product results in adds x8, x8, x27 - adcs x9, x9, xzr - adcs x10, x10, xzr - adc x11, x11, xzr - # Store - ldr x0, [x29, #40] - ldr x1, [x29, #32] - # Add - adds x12, x4, x8 - adcs x13, x5, x9 - adcs x14, x6, x10 - adc x15, x7, x11 + adcs x9, x9, x16 + adcs x10, x10, x17 + adc x11, x11, x19 + # Double + adds x8, x8, x8 + adcs x9, x9, x9 + adcs x10, x10, x10 + adc x11, x11, x11 mov x25, #-19 - asr x28, x15, #63 + asr x28, x11, #63 # Mask the modulus and x25, x28, x25 and x26, x28, #0x7fffffffffffffff # Sub modulus (if overflow) - subs x12, x12, x25 - sbcs x13, x13, x28 - sbcs x14, x14, x28 - sbc x15, x15, x26 + subs x8, x8, x25 + sbcs x9, x9, x28 + sbcs x10, x10, x28 + sbc x11, x11, x26 + mov x3, x0 + sub x2, x0, #32 + # Add + ldp x4, x5, [x3] + ldp x6, x7, [x3, #16] + adds x12, x8, x4 + adcs x13, x9, x5 + adcs x14, x10, x6 + adcs x15, x11, x7 + 
cset x28, cs + mov x25, #19 + extr x28, x28, x15, #63 + mul x25, x28, x25 + # Sub modulus (if overflow) + adds x12, x12, x25 + adcs x13, x13, xzr + and x15, x15, #0x7fffffffffffffff + adcs x14, x14, xzr + adc x15, x15, xzr # Sub - subs x16, x4, x8 - sbcs x17, x5, x9 - sbcs x19, x6, x10 - sbcs x20, x7, x11 - mov x25, #-19 + subs x21, x8, x4 + sbcs x22, x9, x5 + sbcs x23, x10, x6 + sbcs x24, x11, x7 csetm x28, cc - # Mask the modulus - and x25, x28, x25 - and x26, x28, #0x7fffffffffffffff + mov x25, #-19 + extr x28, x28, x24, #63 + mul x25, x28, x25 # Add modulus (if underflow) - adds x16, x16, x25 - adcs x17, x17, x28 - adcs x19, x19, x28 - adc x20, x20, x26 + subs x21, x21, x25 + sbcs x22, x22, xzr + and x24, x24, #0x7fffffffffffffff + sbcs x23, x23, xzr + sbc x24, x24, xzr stp x12, x13, [x0] stp x14, x15, [x0, #16] - stp x16, x17, [x1] - stp x19, x20, [x1, #16] - ldr x17, [x29, #88] - ldr x19, [x29, #96] - ldp x20, x21, [x29, #104] - ldp x22, x23, [x29, #120] - ldp x24, x25, [x29, #136] - ldp x26, x27, [x29, #152] - ldr x28, [x29, #168] - ldp x29, x30, [sp], #0xb0 + stp x21, x22, [x1] + stp x23, x24, [x1, #16] + ldr x17, [x29, #56] + ldr x19, [x29, #64] + ldp x20, x21, [x29, #72] + ldp x22, x23, [x29, #88] + ldp x24, x25, [x29, #104] + ldp x26, x27, [x29, #120] + ldr x28, [x29, #136] + ldp x29, x30, [sp], #0x90 + ret +#ifndef __APPLE__ + .size ge_sub,.-ge_sub +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl sc_reduce +.type sc_reduce,@function +.align 2 +sc_reduce: +#else +.section __TEXT,__text +.globl _sc_reduce +.p2align 2 +_sc_reduce: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-64]! + add x29, sp, #0 + str x17, [x29, #16] + str x19, [x29, #24] + stp x20, x21, [x29, #32] + stp x22, x23, [x29, #48] + ldp x2, x3, [x0] + ldp x4, x5, [x0, #16] + ldp x6, x7, [x0, #32] + ldp x8, x9, [x0, #48] + lsr x23, x9, #56 + lsl x9, x9, #4 + orr x9, x9, x8, lsr 60 + lsl x8, x8, #4 + orr x8, x8, x7, lsr 60 + lsl x7, x7, #4 + orr x7, x7, x6, lsr 60 + lsl x6, x6, #4 + mov x1, #15 + orr x6, x6, x5, lsr 60 + bic x5, x5, x1, lsl 60 + bic x9, x9, x1, lsl 60 + # Add order times bits 504..511 + mov x11, #0x2c13 + movk x11, #0xa30a, lsl 16 + movk x11, #0x9ce5, lsl 32 + movk x11, #0xa7ed, lsl 48 + mov x13, #0x6329 + movk x13, #0x5d08, lsl 16 + movk x13, #0x621, lsl 32 + movk x13, #0xeb21, lsl 48 + mul x10, x23, x11 + umulh x11, x23, x11 + mul x12, x23, x13 + umulh x13, x23, x13 + adds x6, x6, x10 + adcs x7, x7, x11 + adcs x8, x8, xzr + adc x9, x9, xzr + adds x7, x7, x12 + adcs x8, x8, x13 + adc x9, x9, xzr + subs x8, x8, x23 + sbc x9, x9, xzr + # Sub product of top 4 words and order + mov x1, #0x2c13 + movk x1, #0xa30a, lsl 16 + movk x1, #0x9ce5, lsl 32 + movk x1, #0xa7ed, lsl 48 + mul x10, x6, x1 + umulh x11, x6, x1 + mul x12, x7, x1 + umulh x13, x7, x1 + mul x14, x8, x1 + umulh x15, x8, x1 + mul x16, x9, x1 + umulh x17, x9, x1 + adds x2, x2, x10 + adcs x3, x3, x11 + adcs x4, x4, x14 + adcs x5, x5, x15 + adc x19, xzr, xzr + adds x3, x3, x12 + adcs x4, x4, x13 + adcs x5, x5, x16 + adc x19, x19, x17 + mov x1, #0x6329 + movk x1, #0x5d08, lsl 16 + movk x1, #0x621, lsl 32 + movk x1, #0xeb21, lsl 48 + mul x10, x6, x1 + umulh x11, x6, x1 + mul x12, x7, x1 + umulh x13, x7, x1 + mul x14, x8, x1 + umulh x15, x8, x1 + mul x16, x9, x1 + umulh x17, x9, x1 + adds x3, x3, x10 + adcs x4, x4, x11 + adcs x5, x5, x14 + adcs x19, x19, x15 + adc x20, xzr, xzr + adds x4, x4, x12 + adcs x5, x5, x13 + adcs x19, x19, x16 + adc x20, x20, x17 + subs x4, x4, x6 + sbcs x5, x5, x7 + sbcs x6, x19, x8 + sbc x7, x20, x9 + asr x23, 
x7, #57 + # Conditionally subtract order starting at bit 125 + mov x10, xzr + mov x13, xzr + mov x11, #0xba7d + movk x11, #0x4b9e, lsl 16 + movk x11, #0x4c63, lsl 32 + movk x11, #0xcb02, lsl 48 + mov x12, #0xf39a + movk x12, #0xd45e, lsl 16 + movk x12, #0xdf3b, lsl 32 + movk x12, #0x29b, lsl 48 + movk x10, #0xa000, lsl 48 + movk x13, #0x200, lsl 48 + and x10, x10, x23 + and x11, x11, x23 + and x12, x12, x23 + and x13, x13, x23 + adds x3, x3, x10 + adcs x4, x4, x11 + adcs x5, x5, x12 + adcs x6, x6, xzr + adc x7, x7, x13 + # Move bits 252-376 to own registers + lsl x7, x7, #4 + orr x7, x7, x6, lsr 60 + lsl x6, x6, #4 + mov x23, #15 + orr x6, x6, x5, lsr 60 + bic x5, x5, x23, lsl 60 + # Sub product of top 2 words and order + # * -5812631a5cf5d3ed + mov x1, #0x2c13 + movk x1, #0xa30a, lsl 16 + movk x1, #0x9ce5, lsl 32 + movk x1, #0xa7ed, lsl 48 + mul x10, x6, x1 + umulh x11, x6, x1 + mul x12, x7, x1 + umulh x13, x7, x1 + adds x2, x2, x10 + adcs x3, x3, x11 + adc x19, xzr, xzr + adds x3, x3, x12 + adc x19, x19, x13 + # * -14def9dea2f79cd7 + mov x1, #0x6329 + movk x1, #0x5d08, lsl 16 + movk x1, #0x621, lsl 32 + movk x1, #0xeb21, lsl 48 + mul x10, x6, x1 + umulh x11, x6, x1 + mul x12, x7, x1 + umulh x13, x7, x1 + adds x3, x3, x10 + adcs x4, x4, x11 + adc x20, xzr, xzr + adds x4, x4, x12 + adc x20, x20, x13 + # Add overflows at 2 * 64 + mov x1, #15 + bic x5, x5, x1, lsl 60 + adds x4, x4, x19 + adc x5, x5, x20 + # Subtract top at 2 * 64 + subs x4, x4, x6 + sbcs x5, x5, x7 + sbc x1, x1, x1 + # Conditional sub order + mov x10, #0xd3ed + movk x10, #0x5cf5, lsl 16 + movk x10, #0x631a, lsl 32 + movk x10, #0x5812, lsl 48 + mov x11, #0x9cd6 + movk x11, #0xa2f7, lsl 16 + movk x11, #0xf9de, lsl 32 + movk x11, #0x14de, lsl 48 + and x10, x10, x1 + and x11, x11, x1 + adds x2, x2, x10 + adcs x3, x3, x11 + and x1, x1, #0x1000000000000000 + adcs x4, x4, xzr + mov x23, #15 + adc x5, x5, x1 + bic x5, x5, x23, lsl 60 + # Store result + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + ldr x17, [x29, #16] + ldr x19, [x29, #24] + ldp x20, x21, [x29, #32] + ldp x22, x23, [x29, #48] + ldp x29, x30, [sp], #0x40 + ret +#ifndef __APPLE__ + .size sc_reduce,.-sc_reduce +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl sc_muladd +.type sc_muladd,@function +.align 2 +sc_muladd: +#else +.section __TEXT,__text +.globl _sc_muladd +.p2align 2 +_sc_muladd: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-96]! 
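Annotation for sc_reduce above and sc_muladd below: both reduce a wide value modulo the Ed25519 group order n = 2^252 + 0x14DEF9DEA2F79CD65812631A5CF5D3ED, with sc_muladd computing (a*b + c) mod n as used for the S half of a signature. The mov/movk constants are precomputed words derived from n; in outline the folding they implement is

    x = h\cdot 2^{252} + \ell \;\Rightarrow\; x \equiv \ell - h\,(n - 2^{252}) \pmod{n}

applied to the top words in two passes and finished with a conditional subtraction of n, as the comments in the hunk indicate.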
+ add x29, sp, #0 + str x17, [x29, #24] + str x19, [x29, #32] + stp x20, x21, [x29, #40] + stp x22, x23, [x29, #56] + stp x24, x25, [x29, #72] + str x26, [x29, #88] + # Multiply + ldp x12, x13, [x1] + ldp x14, x15, [x1, #16] + ldp x16, x17, [x2] + ldp x19, x20, [x2, #16] + # A[0] * B[0] + umulh x5, x12, x16 + mul x4, x12, x16 + # A[2] * B[0] + umulh x7, x14, x16 + mul x6, x14, x16 + # A[1] * B[0] + mul x21, x13, x16 + adds x5, x5, x21 + umulh x22, x13, x16 + adcs x6, x6, x22 + adc x7, x7, xzr + # A[1] * B[3] + umulh x9, x13, x20 + mul x8, x13, x20 + # A[0] * B[1] + mul x21, x12, x17 + adds x5, x5, x21 + umulh x22, x12, x17 + adcs x6, x6, x22 + # A[2] * B[1] + mul x21, x14, x17 + adcs x7, x7, x21 + umulh x22, x14, x17 + adcs x8, x8, x22 + adc x9, x9, xzr + # A[1] * B[2] + mul x21, x13, x19 + adds x7, x7, x21 + umulh x22, x13, x19 + adcs x8, x8, x22 + adcs x9, x9, xzr + adc x10, xzr, xzr + # A[0] * B[2] + mul x21, x12, x19 + adds x6, x6, x21 + umulh x22, x12, x19 + adcs x7, x7, x22 + adcs x8, x8, xzr + adcs x9, x9, xzr + adc x10, x10, xzr + # A[1] * B[1] + mul x21, x13, x17 + adds x6, x6, x21 + umulh x22, x13, x17 + adcs x7, x7, x22 + # A[3] * B[1] + mul x21, x15, x17 + adcs x8, x8, x21 + umulh x22, x15, x17 + adcs x9, x9, x22 + adc x10, x10, xzr + # A[2] * B[2] + mul x21, x14, x19 + adds x8, x8, x21 + umulh x22, x14, x19 + adcs x9, x9, x22 + # A[3] * B[3] + mul x21, x15, x20 + adcs x10, x10, x21 + umulh x11, x15, x20 + adc x11, x11, xzr + # A[0] * B[3] + mul x21, x12, x20 + adds x7, x7, x21 + umulh x22, x12, x20 + adcs x8, x8, x22 + # A[2] * B[3] + mul x21, x14, x20 + adcs x9, x9, x21 + umulh x22, x14, x20 + adcs x10, x10, x22 + adc x11, x11, xzr + # A[3] * B[0] + mul x21, x15, x16 + adds x7, x7, x21 + umulh x22, x15, x16 + adcs x8, x8, x22 + # A[3] * B[2] + mul x21, x15, x19 + adcs x9, x9, x21 + umulh x22, x15, x19 + adcs x10, x10, x22 + adc x11, x11, xzr + # Add c to a * b + ldp x12, x13, [x3] + ldp x14, x15, [x3, #16] + adds x4, x4, x12 + adcs x5, x5, x13 + adcs x6, x6, x14 + adcs x7, x7, x15 + adcs x8, x8, xzr + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + lsr x25, x11, #56 + lsl x11, x11, #4 + orr x11, x11, x10, lsr 60 + lsl x10, x10, #4 + orr x10, x10, x9, lsr 60 + lsl x9, x9, #4 + orr x9, x9, x8, lsr 60 + lsl x8, x8, #4 + mov x26, #15 + orr x8, x8, x7, lsr 60 + bic x7, x7, x26, lsl 60 + bic x11, x11, x26, lsl 60 + # Add order times bits 504..507 + mov x22, #0x2c13 + movk x22, #0xa30a, lsl 16 + movk x22, #0x9ce5, lsl 32 + movk x22, #0xa7ed, lsl 48 + mov x24, #0x6329 + movk x24, #0x5d08, lsl 16 + movk x24, #0x621, lsl 32 + movk x24, #0xeb21, lsl 48 + mul x21, x25, x22 + umulh x22, x25, x22 + mul x23, x25, x24 + umulh x24, x25, x24 + adds x8, x8, x21 + adcs x9, x9, x22 + adcs x10, x10, xzr + adc x11, x11, xzr + adds x9, x9, x23 + adcs x10, x10, x24 + adc x11, x11, xzr + subs x10, x10, x25 + sbc x11, x11, xzr + # Sub product of top 4 words and order + mov x26, #0x2c13 + movk x26, #0xa30a, lsl 16 + movk x26, #0x9ce5, lsl 32 + movk x26, #0xa7ed, lsl 48 + mul x16, x8, x26 + umulh x17, x8, x26 + mul x19, x9, x26 + umulh x20, x9, x26 + mul x21, x10, x26 + umulh x22, x10, x26 + mul x23, x11, x26 + umulh x24, x11, x26 + adds x4, x4, x16 + adcs x5, x5, x17 + adcs x6, x6, x21 + adcs x7, x7, x22 + adc x12, xzr, xzr + adds x5, x5, x19 + adcs x6, x6, x20 + adcs x7, x7, x23 + adc x12, x12, x24 + mov x26, #0x6329 + movk x26, #0x5d08, lsl 16 + movk x26, #0x621, lsl 32 + movk x26, #0xeb21, lsl 48 + mul x16, x8, x26 + umulh x17, x8, x26 + mul x19, x9, x26 + umulh x20, x9, x26 + mul x21, 
x10, x26 + umulh x22, x10, x26 + mul x23, x11, x26 + umulh x24, x11, x26 + adds x5, x5, x16 + adcs x6, x6, x17 + adcs x7, x7, x21 + adcs x12, x12, x22 + adc x13, xzr, xzr + adds x6, x6, x19 + adcs x7, x7, x20 + adcs x12, x12, x23 + adc x13, x13, x24 + subs x6, x6, x8 + sbcs x7, x7, x9 + sbcs x8, x12, x10 + sbc x9, x13, x11 + asr x25, x9, #57 + # Conditionally subtract order starting at bit 125 + mov x16, xzr + mov x20, xzr + mov x17, #0xba7d + movk x17, #0x4b9e, lsl 16 + movk x17, #0x4c63, lsl 32 + movk x17, #0xcb02, lsl 48 + mov x19, #0xf39a + movk x19, #0xd45e, lsl 16 + movk x19, #0xdf3b, lsl 32 + movk x19, #0x29b, lsl 48 + movk x16, #0xa000, lsl 48 + movk x20, #0x200, lsl 48 + and x16, x16, x25 + and x17, x17, x25 + and x19, x19, x25 + and x20, x20, x25 + adds x5, x5, x16 + adcs x6, x6, x17 + adcs x7, x7, x19 + adcs x8, x8, xzr + adc x9, x9, x20 + # Move bits 252-376 to own registers + lsl x9, x9, #4 + orr x9, x9, x8, lsr 60 + lsl x8, x8, #4 + mov x25, #15 + orr x8, x8, x7, lsr 60 + bic x7, x7, x25, lsl 60 + # Sub product of top 2 words and order + # * -5812631a5cf5d3ed + mov x26, #0x2c13 + movk x26, #0xa30a, lsl 16 + movk x26, #0x9ce5, lsl 32 + movk x26, #0xa7ed, lsl 48 + mul x16, x8, x26 + umulh x17, x8, x26 + mul x19, x9, x26 + umulh x20, x9, x26 + adds x4, x4, x16 + adcs x5, x5, x17 + adc x12, xzr, xzr + adds x5, x5, x19 + adc x12, x12, x20 + # * -14def9dea2f79cd7 + mov x26, #0x6329 + movk x26, #0x5d08, lsl 16 + movk x26, #0x621, lsl 32 + movk x26, #0xeb21, lsl 48 + mul x16, x8, x26 + umulh x17, x8, x26 + mul x19, x9, x26 + umulh x20, x9, x26 + adds x5, x5, x16 + adcs x6, x6, x17 + adc x13, xzr, xzr + adds x6, x6, x19 + adc x13, x13, x20 + # Add overflows at 2 * 64 + mov x26, #15 + bic x7, x7, x26, lsl 60 + adds x6, x6, x12 + adc x7, x7, x13 + # Subtract top at 2 * 64 + subs x6, x6, x8 + sbcs x7, x7, x9 + sbc x26, x26, x26 + # Conditional sub order + mov x16, #0xd3ed + movk x16, #0x5cf5, lsl 16 + movk x16, #0x631a, lsl 32 + movk x16, #0x5812, lsl 48 + mov x17, #0x9cd6 + movk x17, #0xa2f7, lsl 16 + movk x17, #0xf9de, lsl 32 + movk x17, #0x14de, lsl 48 + and x16, x16, x26 + and x17, x17, x26 + adds x4, x4, x16 + adcs x5, x5, x17 + and x26, x26, #0x1000000000000000 + adcs x6, x6, xzr + mov x25, #15 + adc x7, x7, x26 + bic x7, x7, x25, lsl 60 + # Store result + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + ldr x17, [x29, #24] + ldr x19, [x29, #32] + ldp x20, x21, [x29, #40] + ldp x22, x23, [x29, #56] + ldp x24, x25, [x29, #72] + ldr x26, [x29, #88] + ldp x29, x30, [sp], #0x60 ret #ifndef __APPLE__ - .size fe_ge_sub,.-fe_ge_sub + .size sc_muladd,.-sc_muladd #endif /* __APPLE__ */ -#endif /* HAVE_CURVE25519 */ +#endif /* HAVE_ED25519 */ +#endif /* !CURVE25519_SMALL || !ED25519_SMALL */ +#endif /* HAVE_CURVE25519 || HAVE_ED25519 */ #endif /* __aarch64__ */ #endif /* WOLFSSL_ARMASM */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif +#endif /* !WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-curve25519_c.c b/wolfcrypt/src/port/arm/armv8-curve25519_c.c index 3484b07bf0..2c97175197 100644 --- a/wolfcrypt/src/port/arm/armv8-curve25519_c.c +++ b/wolfcrypt/src/port/arm/armv8-curve25519_c.c @@ -30,19 +30,26 @@ */ #ifdef WOLFSSL_ARMASM #ifdef __aarch64__ -#ifdef HAVE_CURVE25519 +#ifdef WOLFSSL_ARMASM_INLINE +#include +#define CURVED25519_ASM +#include + +#if defined(HAVE_CURVE25519) || defined(HAVE_ED25519) +#if !defined(CURVE25519_SMALL) || !defined(ED25519_SMALL) #include void fe_init() { __asm__ __volatile__ ( "\n\t" - : + : : : "memory" ); 
} +#ifdef HAVE_ED25519 void fe_frombytes(fe out, const unsigned char* in) { __asm__ __volatile__ ( @@ -132,16 +139,17 @@ void fe_sub(fe r, const fe a, const fe b) "sbcs x4, x4, x8\n\t" "sbcs x5, x5, x9\n\t" "sbcs x6, x6, x10\n\t" - "mov x12, #-19\n\t" "csetm x11, cc\n\t" + "mov x12, #-19\n\t" /* Mask the modulus */ - "and x12, x11, x12\n\t" - "and x13, x11, #0x7fffffffffffffff\n\t" + "extr x11, x11, x6, #63\n\t" + "mul x12, x11, x12\n\t" /* Add modulus (if underflow) */ - "adds x3, x3, x12\n\t" - "adcs x4, x4, x11\n\t" - "adcs x5, x5, x11\n\t" - "adc x6, x6, x13\n\t" + "subs x3, x3, x12\n\t" + "sbcs x4, x4, xzr\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "sbcs x5, x5, xzr\n\t" + "sbc x6, x6, xzr\n\t" "stp x3, x4, [%x[r]]\n\t" "stp x5, x6, [%x[r], #16]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -161,17 +169,18 @@ void fe_add(fe r, const fe a, const fe b) "adds x3, x3, x7\n\t" "adcs x4, x4, x8\n\t" "adcs x5, x5, x9\n\t" - "adc x6, x6, x10\n\t" - "mov x12, #-19\n\t" - "asr x11, x6, #63\n\t" + "adcs x6, x6, x10\n\t" + "cset x11, cs\n\t" + "mov x12, #19\n\t" /* Mask the modulus */ - "and x12, x11, x12\n\t" - "and x13, x11, #0x7fffffffffffffff\n\t" + "extr x11, x11, x6, #63\n\t" + "mul x12, x11, x12\n\t" /* Sub modulus (if overflow) */ - "subs x3, x3, x12\n\t" - "sbcs x4, x4, x11\n\t" - "sbcs x5, x5, x11\n\t" - "sbc x6, x6, x13\n\t" + "adds x3, x3, x12\n\t" + "adcs x4, x4, xzr\n\t" + "and x6, x6, #0x7fffffffffffffff\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" "stp x3, x4, [%x[r]]\n\t" "stp x5, x6, [%x[r], #16]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -460,6 +469,7 @@ void fe_cmov_table(fe* r, fe* base, signed char b) ); } +#endif /* HAVE_ED25519 */ void fe_mul(fe r, const fe a, const fe b) { __asm__ __volatile__ ( @@ -468,136 +478,122 @@ void fe_mul(fe r, const fe a, const fe b) "ldp x16, x17, [%x[a], #16]\n\t" "ldp x19, x20, [%x[b]]\n\t" "ldp x21, x22, [%x[b], #16]\n\t" - /* A[0] * B[0] */ - "mul x6, x14, x19\n\t" + /* A[0] * B[0] */ "umulh x7, x14, x19\n\t" - /* A[0] * B[1] */ - "mul x3, x14, x20\n\t" - "umulh x8, x14, x20\n\t" - "adds x7, x7, x3\n\t" - "adc x8, x8, xzr\n\t" - /* A[1] * B[0] */ + "mul x6, x14, x19\n\t" + /* A[2] * B[0] */ + "umulh x9, x16, x19\n\t" + "mul x8, x16, x19\n\t" + /* A[1] * B[0] */ "mul x3, x15, x19\n\t" + "adds x7, x7, x3\n\t" "umulh x4, x15, x19\n\t" + "adcs x8, x8, x4\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[3] */ + "umulh x11, x15, x22\n\t" + "mul x10, x15, x22\n\t" + /* A[0] * B[1] */ + "mul x3, x14, x20\n\t" "adds x7, x7, x3\n\t" + "umulh x4, x14, x20\n\t" "adcs x8, x8, x4\n\t" - "adc x9, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x3, x14, x21\n\t" - "umulh x4, x14, x21\n\t" - "adds x8, x8, x3\n\t" - "adc x9, x9, x4\n\t" - /* A[1] * B[1] */ - "mul x3, x15, x20\n\t" - "umulh x4, x15, x20\n\t" - "adds x8, x8, x3\n\t" - "adcs x9, x9, x4\n\t" - "adc x10, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x3, x16, x19\n\t" - "umulh x4, x16, x19\n\t" - "adds x8, x8, x3\n\t" - "adcs x9, x9, x4\n\t" - "adc x10, x10, xzr\n\t" - /* A[0] * B[3] */ - "mul x3, x14, x22\n\t" - "umulh x4, x14, x22\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x3, x15, x21\n\t" - "umulh x4, x15, x21\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adc x11, x11, xzr\n\t" - /* A[2] * B[1] */ + /* A[2] * B[1] */ "mul x3, x16, x20\n\t" + "adcs x9, x9, x3\n\t" "umulh x4, x16, x20\n\t" - "adds x9, x9, x3\n\t" "adcs x10, x10, x4\n\t" "adc x11, x11, xzr\n\t" - /* A[3] * B[0] */ - "mul x3, x17, x19\n\t" - 
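A minimal C sketch of the carry fold that the rewritten fe_add (and, mirrored with a subtract, fe_sub) now performs in place of the old masked conditional add/sub of the modulus; fe_add255 and the use of unsigned __int128 are illustrative only, not wolfSSL API:

    #include <stdint.h>

    /* Add two 4-limb field elements and fold the overflow back in,
     * using overflow * 2^255 == overflow * 19 (mod 2^255 - 19). */
    static void fe_add255(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
    {
        unsigned __int128 t = 0;
        uint64_t s[4];
        uint64_t overflow;
        int i;

        for (i = 0; i < 4; i++) {
            t += (unsigned __int128)a[i] + b[i];
            s[i] = (uint64_t)t;
            t >>= 64;
        }
        /* bits 255 and up of the raw sum */
        overflow = ((uint64_t)t << 1) | (s[3] >> 63);
        s[3] &= 0x7fffffffffffffffULL;
        /* fold back in as overflow * 19 */
        t = (unsigned __int128)overflow * 19;
        for (i = 0; i < 4; i++) {
            t += s[i];
            r[i] = (uint64_t)t;
            t >>= 64;
        }
    }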
"umulh x4, x17, x19\n\t" + /* A[1] * B[2] */ + "mul x3, x15, x21\n\t" "adds x9, x9, x3\n\t" + "umulh x4, x15, x21\n\t" "adcs x10, x10, x4\n\t" - "adc x11, x11, xzr\n\t" - /* A[1] * B[3] */ - "mul x3, x15, x22\n\t" - "umulh x4, x15, x22\n\t" - "adds x10, x10, x3\n\t" - "adcs x11, x11, x4\n\t" + "adcs x11, x11, xzr\n\t" "adc x12, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x3, x16, x21\n\t" - "umulh x4, x16, x21\n\t" - "adds x10, x10, x3\n\t" - "adcs x11, x11, x4\n\t" + /* A[0] * B[2] */ + "mul x3, x14, x21\n\t" + "adds x8, x8, x3\n\t" + "umulh x4, x14, x21\n\t" + "adcs x9, x9, x4\n\t" + "adcs x10, x10, xzr\n\t" + "adcs x11, x11, xzr\n\t" "adc x12, x12, xzr\n\t" - /* A[3] * B[1] */ + /* A[1] * B[1] */ + "mul x3, x15, x20\n\t" + "adds x8, x8, x3\n\t" + "umulh x4, x15, x20\n\t" + "adcs x9, x9, x4\n\t" + /* A[3] * B[1] */ "mul x3, x17, x20\n\t" + "adcs x10, x10, x3\n\t" "umulh x4, x17, x20\n\t" - "adds x10, x10, x3\n\t" "adcs x11, x11, x4\n\t" "adc x12, x12, xzr\n\t" - /* A[2] * B[3] */ + /* A[2] * B[2] */ + "mul x3, x16, x21\n\t" + "adds x10, x10, x3\n\t" + "umulh x4, x16, x21\n\t" + "adcs x11, x11, x4\n\t" + /* A[3] * B[3] */ + "mul x3, x17, x22\n\t" + "adcs x12, x12, x3\n\t" + "umulh x13, x17, x22\n\t" + "adc x13, x13, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x14, x22\n\t" + "adds x9, x9, x3\n\t" + "umulh x4, x14, x22\n\t" + "adcs x10, x10, x4\n\t" + /* A[2] * B[3] */ "mul x3, x16, x22\n\t" + "adcs x11, x11, x3\n\t" "umulh x4, x16, x22\n\t" - "adds x11, x11, x3\n\t" "adcs x12, x12, x4\n\t" - "adc x13, xzr, xzr\n\t" - /* A[3] * B[2] */ + "adc x13, x13, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x17, x19\n\t" + "adds x9, x9, x3\n\t" + "umulh x4, x17, x19\n\t" + "adcs x10, x10, x4\n\t" + /* A[3] * B[2] */ "mul x3, x17, x21\n\t" + "adcs x11, x11, x3\n\t" "umulh x4, x17, x21\n\t" - "adds x11, x11, x3\n\t" "adcs x12, x12, x4\n\t" "adc x13, x13, xzr\n\t" - /* A[3] * B[3] */ - "mul x3, x17, x22\n\t" - "umulh x4, x17, x22\n\t" - "adds x12, x12, x3\n\t" - "adc x13, x13, x4\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x13, x13, x12, #63\n\t" - "extr x12, x12, x11, #63\n\t" - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ + "mov x3, #38\n\t" + "mul x4, x3, x13\n\t" + "adds x9, x9, x4\n\t" + "umulh x5, x3, x13\n\t" + "adc x5, x5, xzr\n\t" "mov x3, #19\n\t" + "extr x5, x5, x9, #63\n\t" + "mul x5, x5, x3\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" "mul x4, x3, x10\n\t" - "umulh x10, x3, x10\n\t" "adds x6, x6, x4\n\t" + "umulh x10, x3, x10\n\t" "mul x4, x3, x11\n\t" - "umulh x11, x3, x11\n\t" "adcs x7, x7, x4\n\t" + "umulh x11, x3, x11\n\t" "mul x4, x3, x12\n\t" - "umulh x12, x3, x12\n\t" "adcs x8, x8, x4\n\t" - "mul x4, x3, x13\n\t" - "umulh x5, x3, x13\n\t" - "adcs x9, x9, x4\n\t" - "adc x5, x5, xzr\n\t" - /* Add remaining product results in */ - "adds x7, x7, x10\n\t" - "adcs x8, x8, x11\n\t" - "adcs x9, x9, x12\n\t" - "adc x5, x5, xzr\n\t" - /* Overflow */ - "extr x5, x5, x9, #63\n\t" - "mul x5, x5, x3\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" + "umulh x12, x3, x12\n\t" "adc x9, x9, xzr\n\t" - /* Reduce if top bit set */ - "and x5, x3, x9, asr 63\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" + /* Add high product results in */ "adds x6, x6, x5\n\t" + "adcs x7, x7, x10\n\t" + "adcs x8, x8, x11\n\t" + "adc x9, x9, x12\n\t" + /* Reduce if top bit set */ + "mov x3, #19\n\t" + "and x4, x3, x9, 
asr 63\n\t" + "adds x6, x6, x4\n\t" "adcs x7, x7, xzr\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" "adcs x8, x8, xzr\n\t" "adc x9, x9, xzr\n\t" /* Store */ @@ -616,33 +612,30 @@ void fe_sq(fe r, const fe a) "ldp x13, x14, [%x[a]]\n\t" "ldp x15, x16, [%x[a], #16]\n\t" /* A[0] * A[1] */ - "mul x6, x13, x14\n\t" "umulh x7, x13, x14\n\t" + "mul x6, x13, x14\n\t" + /* A[0] * A[3] */ + "umulh x9, x13, x16\n\t" + "mul x8, x13, x16\n\t" /* A[0] * A[2] */ "mul x2, x13, x15\n\t" - "umulh x8, x13, x15\n\t" "adds x7, x7, x2\n\t" - "adc x8, x8, xzr\n\t" - /* A[0] * A[3] */ - "mul x2, x13, x16\n\t" - "umulh x9, x13, x16\n\t" - "adds x8, x8, x2\n\t" - "adc x9, x9, xzr\n\t" + "umulh x3, x13, x15\n\t" + "adcs x8, x8, x3\n\t" + /* A[1] * A[3] */ + "mul x2, x14, x16\n\t" + "adcs x9, x9, x2\n\t" + "umulh x10, x14, x16\n\t" + "adc x10, x10, xzr\n\t" /* A[1] * A[2] */ "mul x2, x14, x15\n\t" - "umulh x3, x14, x15\n\t" "adds x8, x8, x2\n\t" + "umulh x3, x14, x15\n\t" "adcs x9, x9, x3\n\t" - "adc x10, xzr, xzr\n\t" - /* A[1] * A[3] */ - "mul x2, x14, x16\n\t" - "umulh x3, x14, x16\n\t" - "adds x9, x9, x2\n\t" - "adc x10, x10, x3\n\t" /* A[2] * A[3] */ "mul x2, x15, x16\n\t" + "adcs x10, x10, x2\n\t" "umulh x11, x15, x16\n\t" - "adds x10, x10, x2\n\t" "adc x11, x11, xzr\n\t" /* Double */ "adds x6, x6, x6\n\t" @@ -653,66 +646,56 @@ void fe_sq(fe r, const fe a) "adcs x11, x11, x11\n\t" "adc x12, xzr, xzr\n\t" /* A[0] * A[0] */ + "umulh x3, x13, x13\n\t" "mul x5, x13, x13\n\t" - "umulh x4, x13, x13\n\t" /* A[1] * A[1] */ "mul x2, x14, x14\n\t" + "adds x6, x6, x3\n\t" "umulh x3, x14, x14\n\t" - "adds x6, x6, x4\n\t" "adcs x7, x7, x2\n\t" - "adc x4, x3, xzr\n\t" /* A[2] * A[2] */ "mul x2, x15, x15\n\t" + "adcs x8, x8, x3\n\t" "umulh x3, x15, x15\n\t" - "adds x8, x8, x4\n\t" "adcs x9, x9, x2\n\t" - "adc x4, x3, xzr\n\t" /* A[3] * A[3] */ "mul x2, x16, x16\n\t" + "adcs x10, x10, x3\n\t" "umulh x3, x16, x16\n\t" - "adds x10, x10, x4\n\t" "adcs x11, x11, x2\n\t" "adc x12, x12, x3\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x12, x12, x11, #63\n\t" - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "and x8, x8, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ + "mov x2, #38\n\t" + "mul x3, x2, x12\n\t" + "adds x8, x8, x3\n\t" + "umulh x4, x2, x12\n\t" + "adc x4, x4, xzr\n\t" "mov x2, #19\n\t" + "extr x4, x4, x8, #63\n\t" + "mul x4, x4, x2\n\t" + "and x8, x8, #0x7fffffffffffffff\n\t" + "mov x2, #38\n\t" "mul x3, x2, x9\n\t" - "umulh x9, x2, x9\n\t" "adds x5, x5, x3\n\t" + "umulh x9, x2, x9\n\t" "mul x3, x2, x10\n\t" - "umulh x10, x2, x10\n\t" "adcs x6, x6, x3\n\t" + "umulh x10, x2, x10\n\t" "mul x3, x2, x11\n\t" - "umulh x11, x2, x11\n\t" "adcs x7, x7, x3\n\t" - "mul x3, x2, x12\n\t" - "umulh x4, x2, x12\n\t" - "adcs x8, x8, x3\n\t" - "adc x4, x4, xzr\n\t" - /* Add remaining product results in */ - "adds x6, x6, x9\n\t" - "adcs x7, x7, x10\n\t" - "adcs x8, x8, x11\n\t" - "adc x4, x4, xzr\n\t" - /* Overflow */ - "extr x4, x4, x8, #63\n\t" - "mul x4, x4, x2\n\t" - "and x8, x8, #0x7fffffffffffffff\n\t" - "adds x5, x5, x4\n\t" - "adcs x6, x6, xzr\n\t" - "adcs x7, x7, xzr\n\t" + "umulh x11, x2, x11\n\t" "adc x8, x8, xzr\n\t" - /* Reduce if top bit set */ - "and x4, x2, x8, asr 63\n\t" - "and x8, x8, #0x7fffffffffffffff\n\t" + /* Add high product results in */ "adds x5, x5, x4\n\t" + "adcs x6, x6, x9\n\t" + "adcs x7, x7, x10\n\t" + "adc x8, x8, x11\n\t" + /* Reduce if top bit set */ + "mov x2, #19\n\t" + "and x3, x2, x8, asr 63\n\t" + "adds 
x5, x5, x3\n\t" "adcs x6, x6, xzr\n\t" + "and x8, x8, #0x7fffffffffffffff\n\t" "adcs x7, x7, xzr\n\t" "adc x8, x8, xzr\n\t" /* Store */ @@ -792,120 +775,199 @@ void fe_invert(fe r, const fe a) #else "bl _fe_mul\n\t" #endif /* __APPLE__ */ - "add x0, x29, #0x50\n\t" -#ifndef NDEBUG - "add x1, x29, #48\n\t" -#endif /* !NDEBUG */ -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ - "mov x20, #3\n\t" -#ifndef NDEBUG - "add x0, x29, #0x50\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #0x50\n\t" + /* Loop: 5 times */ + "mov x20, #5\n\t" + "ldp x6, x7, [x29, #48]\n\t" + "ldp x8, x9, [x29, #64]\n\t" "\n" "L_fe_invert1_%=: \n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ - "subs x20, x20, #1\n\t" - "bcs L_fe_invert1_%=\n\t" - "add x0, x29, #48\n\t" -#ifndef NDEBUG - "add x1, x29, #0x50\n\t" -#endif /* !NDEBUG */ - "add x2, x29, #48\n\t" -#ifndef __APPLE__ - "bl fe_mul\n\t" -#else - "bl _fe_mul\n\t" -#endif /* __APPLE__ */ - "add x0, x29, #0x50\n\t" - "add x1, x29, #48\n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ - "mov x20, #8\n\t" -#ifndef NDEBUG - "add x0, x29, #0x50\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #0x50\n\t" - "\n" - "L_fe_invert2_%=: \n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ - "subs x20, x20, #1\n\t" - "bcs L_fe_invert2_%=\n\t" -#ifndef NDEBUG - "add x0, x29, #0x50\n\t" -#endif /* !NDEBUG */ -#ifndef NDEBUG - "add x1, x29, #0x50\n\t" -#endif /* !NDEBUG */ - "add x2, x29, #48\n\t" -#ifndef __APPLE__ - "bl fe_mul\n\t" -#else - "bl _fe_mul\n\t" -#endif /* __APPLE__ */ - "add x0, x29, #0x70\n\t" -#ifndef NDEBUG - "add x1, x29, #0x50\n\t" -#endif /* !NDEBUG */ -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ - "mov x20, #18\n\t" -#ifndef NDEBUG - "add x0, x29, #0x70\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #0x70\n\t" - "\n" - "L_fe_invert3_%=: \n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + 
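The Square blocks here and in the fe_invert loops below compute each cross product once, double the partial sum, then add the square terms; for a = a_0 + a_1 2^64 + a_2 2^128 + a_3 2^192 this is the identity (sketch only)

    a^2 = \sum_{i} a_i^2\,2^{128 i} \;+\; 2\sum_{i<j} a_i a_j\,2^{64(i+j)}

which the mul/umulh pairs under the A[i] * A[j] comments accumulate before the usual mod 2^255 - 19 reduce.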
"mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" "subs x20, x20, #1\n\t" - "bcs L_fe_invert3_%=\n\t" - "add x0, x29, #0x50\n\t" + "bne L_fe_invert1_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #80]\n\t" + "stp x8, x9, [x29, #96]\n\t" #ifndef NDEBUG - "add x1, x29, #0x70\n\t" + "add x0, x29, #48\n\t" #endif /* !NDEBUG */ - "add x2, x29, #0x50\n\t" + "add x1, x29, #0x50\n\t" + "add x2, x29, #48\n\t" #ifndef __APPLE__ "bl fe_mul\n\t" #else "bl _fe_mul\n\t" #endif /* __APPLE__ */ - "mov x20, #9\n\t" -#ifndef NDEBUG - "add x0, x29, #0x50\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #0x50\n\t" + /* Loop: 10 times */ + "mov x20, #10\n\t" + "ldp x6, x7, [x29, #48]\n\t" + "ldp x8, x9, [x29, #64]\n\t" "\n" - "L_fe_invert4_%=: \n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ + "L_fe_invert2_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" "subs x20, x20, #1\n\t" - "bcs L_fe_invert4_%=\n\t" - "add x0, x29, #48\n\t" + "bne L_fe_invert2_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #80]\n\t" + "stp x8, x9, [x29, #96]\n\t" + "add x0, x29, #0x50\n\t" #ifndef NDEBUG "add x1, x29, #0x50\n\t" #endif /* !NDEBUG */ @@ -915,87 +977,298 @@ void fe_invert(fe r, const fe a) 
#else "bl _fe_mul\n\t" #endif /* __APPLE__ */ - "add x0, x29, #0x50\n\t" - "add x1, x29, #48\n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ - "mov x20, #48\n\t" -#ifndef NDEBUG - "add x0, x29, #0x50\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #0x50\n\t" + /* Loop: 20 times */ + "mov x20, #20\n\t" + "ldp x6, x7, [x29, #80]\n\t" + "ldp x8, x9, [x29, #96]\n\t" "\n" - "L_fe_invert5_%=: \n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ + "L_fe_invert3_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" "subs x20, x20, #1\n\t" - "bcs L_fe_invert5_%=\n\t" + "bne L_fe_invert3_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #112]\n\t" + "stp x8, x9, [x29, #128]\n\t" #ifndef NDEBUG "add x0, x29, #0x50\n\t" #endif /* !NDEBUG */ -#ifndef NDEBUG - "add x1, x29, #0x50\n\t" -#endif /* !NDEBUG */ - "add x2, x29, #48\n\t" + "add x1, x29, #0x70\n\t" + "add x2, x29, #0x50\n\t" #ifndef __APPLE__ "bl fe_mul\n\t" #else "bl _fe_mul\n\t" #endif /* __APPLE__ */ - "add x0, x29, #0x70\n\t" -#ifndef NDEBUG - "add x1, x29, #0x50\n\t" -#endif /* !NDEBUG */ -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ - "mov x20, #0x62\n\t" -#ifndef NDEBUG - "add x0, x29, #0x70\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #0x70\n\t" + /* Loop: 10 times */ + "mov x20, #10\n\t" + "ldp x6, x7, [x29, #80]\n\t" + "ldp x8, x9, [x29, #96]\n\t" "\n" - "L_fe_invert6_%=: \n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ + "L_fe_invert4_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul 
x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" "subs x20, x20, #1\n\t" - "bcs L_fe_invert6_%=\n\t" - "add x0, x29, #0x50\n\t" -#ifndef NDEBUG - "add x1, x29, #0x70\n\t" -#endif /* !NDEBUG */ - "add x2, x29, #0x50\n\t" + "bne L_fe_invert4_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #80]\n\t" + "stp x8, x9, [x29, #96]\n\t" + "add x0, x29, #48\n\t" + "add x1, x29, #0x50\n\t" + "add x2, x29, #48\n\t" #ifndef __APPLE__ "bl fe_mul\n\t" #else "bl _fe_mul\n\t" #endif /* __APPLE__ */ - "mov x20, #49\n\t" -#ifndef NDEBUG - "add x0, x29, #0x50\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #0x50\n\t" + /* Loop: 50 times */ + "mov x20, #50\n\t" + "ldp x6, x7, [x29, #48]\n\t" + "ldp x8, x9, [x29, #64]\n\t" "\n" - "L_fe_invert7_%=: \n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ + "L_fe_invert5_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh 
x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" "subs x20, x20, #1\n\t" - "bcs L_fe_invert7_%=\n\t" - "add x0, x29, #48\n\t" + "bne L_fe_invert5_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #80]\n\t" + "stp x8, x9, [x29, #96]\n\t" + "add x0, x29, #0x50\n\t" #ifndef NDEBUG "add x1, x29, #0x50\n\t" #endif /* !NDEBUG */ @@ -1005,463 +1278,679 @@ void fe_invert(fe r, const fe a) #else "bl _fe_mul\n\t" #endif /* __APPLE__ */ - "mov x20, #4\n\t" -#ifndef NDEBUG - "add x0, x29, #48\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #48\n\t" + /* Loop: 100 times */ + "mov x20, #0x64\n\t" + "ldp x6, x7, [x29, #80]\n\t" + "ldp x8, x9, [x29, #96]\n\t" "\n" - "L_fe_invert8_%=: \n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ + "L_fe_invert6_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, 
x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" "subs x20, x20, #1\n\t" - "bcs L_fe_invert8_%=\n\t" - "ldr x0, [x29, #144]\n\t" + "bne L_fe_invert6_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #112]\n\t" + "stp x8, x9, [x29, #128]\n\t" #ifndef NDEBUG - "add x1, x29, #48\n\t" + "add x0, x29, #0x50\n\t" #endif /* !NDEBUG */ - "add x2, x29, #16\n\t" + "add x1, x29, #0x70\n\t" + "add x2, x29, #0x50\n\t" #ifndef __APPLE__ "bl fe_mul\n\t" #else "bl _fe_mul\n\t" #endif /* __APPLE__ */ - "ldp x29, x30, [sp], #0xa0\n\t" - : [r] "+r" (r), [a] "+r" (a) - : - : "memory", "x2", "x20" - ); -} - -int curve25519(byte* r, const byte* n, const byte* a) -{ - __asm__ __volatile__ ( - "stp x29, x30, [sp, #-192]!\n\t" - "add x29, sp, #0\n\t" + /* Loop: 50 times */ + "mov x20, #50\n\t" + "ldp x6, x7, [x29, #80]\n\t" + "ldp x8, x9, [x29, #96]\n\t" + "\n" + "L_fe_invert7_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" + "subs x20, x20, #1\n\t" + "bne L_fe_invert7_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #80]\n\t" + "stp x8, x9, [x29, #96]\n\t" + "add x0, x29, #48\n\t" + "add x1, x29, #0x50\n\t" + "add x2, x29, #48\n\t" +#ifndef __APPLE__ + "bl fe_mul\n\t" +#else + "bl _fe_mul\n\t" +#endif /* __APPLE__ */ + /* Loop: 5 times */ + "mov x20, #5\n\t" + "ldp x6, x7, [x29, #48]\n\t" + "ldp x8, x9, [x29, #64]\n\t" + "\n" + "L_fe_invert8_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh 
x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" + "subs x20, x20, #1\n\t" + "bne L_fe_invert8_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #48]\n\t" + "stp x8, x9, [x29, #64]\n\t" + "ldr x0, [x29, #144]\n\t" + "add x1, x29, #48\n\t" + "add x2, x29, #16\n\t" +#ifndef __APPLE__ + "bl fe_mul\n\t" +#else + "bl _fe_mul\n\t" +#endif /* __APPLE__ */ + "ldp x29, x30, [sp], #0xa0\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "x2", "x20", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17" + ); +} + +int curve25519(byte* r, const byte* n, const byte* a) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-192]!\n\t" + "add x29, sp, #0\n\t" "mov x23, xzr\n\t" "str %x[r], [x29, #176]\n\t" "str %x[a], [x29, #184]\n\t" - /* Copy */ "ldp x6, x7, [%x[a]]\n\t" "ldp x8, x9, [%x[a], #16]\n\t" - "stp x6, x7, [x29, #80]\n\t" - "stp x8, x9, [x29, #96]\n\t" - /* Set one */ - "mov %x[a], #1\n\t" - "stp %x[a], xzr, [%x[r]]\n\t" - "stp xzr, xzr, [%x[r], #16]\n\t" + "mov x10, #1\n\t" + "mov x11, xzr\n\t" + "mov x12, xzr\n\t" + "mov x13, xzr\n\t" + "stp x10, x11, [%x[r]]\n\t" + "stp x12, x13, [%x[r], #16]\n\t" /* Set zero */ "stp xzr, xzr, [x29, #16]\n\t" "stp xzr, xzr, [x29, #32]\n\t" - /* Set one */ - "mov %x[a], #1\n\t" - "stp %x[a], xzr, [x29, #48]\n\t" - "stp xzr, xzr, [x29, #64]\n\t" - "mov x25, #62\n\t" - "mov x24, #24\n\t" - "\n" - "L_curve25519_words_%=: \n\t" + "mov x24, #0xfe\n\t" "\n" "L_curve25519_bits_%=: \n\t" - "ldr %x[a], [%x[n], x24]\n\t" - "lsr %x[a], %x[a], x25\n\t" - "and %x[a], %x[a], #1\n\t" - "eor x23, x23, %x[a]\n\t" + "lsr x3, x24, #6\n\t" + "and x4, x24, #63\n\t" + "ldr x5, [%x[n], x3, LSL 3]\n\t" + "lsr x5, x5, x4\n\t" + "eor x23, x23, x5\n\t" /* Conditional Swap */ - "cmp x23, 
#1\n\t" + "subs xzr, xzr, x23, lsl 63\n\t" + "ldp x25, x26, [x29, #16]\n\t" + "ldp x27, x28, [x29, #32]\n\t" + "csel x19, x25, x10, ne\n\t" + "csel x25, x10, x25, ne\n\t" + "csel x20, x26, x11, ne\n\t" + "csel x26, x11, x26, ne\n\t" + "csel x21, x27, x12, ne\n\t" + "csel x27, x12, x27, ne\n\t" + "csel x22, x28, x13, ne\n\t" + "csel x28, x13, x28, ne\n\t" + /* Conditional Swap */ + "subs xzr, xzr, x23, lsl 63\n\t" "ldp x10, x11, [%x[r]]\n\t" "ldp x12, x13, [%x[r], #16]\n\t" - "ldp x6, x7, [x29, #80]\n\t" - "ldp x8, x9, [x29, #96]\n\t" - "csel x14, x10, x6, eq\n\t" - "csel x10, x6, x10, eq\n\t" - "csel x15, x11, x7, eq\n\t" - "csel x11, x7, x11, eq\n\t" - "csel x16, x12, x8, eq\n\t" - "csel x12, x8, x12, eq\n\t" - "csel x17, x13, x9, eq\n\t" - "csel x13, x9, x13, eq\n\t" - /* Conditional Swap */ - "cmp x23, #1\n\t" - "ldp x19, x20, [x29, #16]\n\t" - "ldp x21, x22, [x29, #32]\n\t" - "ldp x6, x7, [x29, #48]\n\t" - "ldp x8, x9, [x29, #64]\n\t" - "csel x5, x19, x6, eq\n\t" - "csel x19, x6, x19, eq\n\t" - "csel x26, x20, x7, eq\n\t" - "csel x20, x7, x20, eq\n\t" - "csel x27, x21, x8, eq\n\t" - "csel x21, x8, x21, eq\n\t" - "csel x28, x22, x9, eq\n\t" - "csel x22, x9, x22, eq\n\t" - "mov x23, %x[a]\n\t" + "csel x14, x10, x6, ne\n\t" + "csel x10, x6, x10, ne\n\t" + "csel x15, x11, x7, ne\n\t" + "csel x11, x7, x11, ne\n\t" + "csel x16, x12, x8, ne\n\t" + "csel x12, x8, x12, ne\n\t" + "csel x17, x13, x9, ne\n\t" + "csel x13, x9, x13, ne\n\t" + "mov x23, x5\n\t" /* Add */ - "adds x6, x10, x19\n\t" - "adcs x7, x11, x20\n\t" - "adcs x8, x12, x21\n\t" - "adc x9, x13, x22\n\t" - "mov x3, #-19\n\t" - "asr %x[a], x9, #63\n\t" - /* Mask the modulus */ - "and x3, %x[a], x3\n\t" - "and x4, %x[a], #0x7fffffffffffffff\n\t" + "adds x6, x10, x25\n\t" + "adcs x7, x11, x26\n\t" + "adcs x8, x12, x27\n\t" + "adcs x9, x13, x28\n\t" + "cset x5, cs\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x9, #63\n\t" + "mul x3, x5, x3\n\t" /* Sub modulus (if overflow) */ - "subs x6, x6, x3\n\t" - "sbcs x7, x7, %x[a]\n\t" - "sbcs x8, x8, %x[a]\n\t" - "sbc x9, x9, x4\n\t" + "adds x6, x6, x3\n\t" + "adcs x7, x7, xzr\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" /* Sub */ - "subs x19, x10, x19\n\t" - "sbcs x20, x11, x20\n\t" - "sbcs x21, x12, x21\n\t" - "sbcs x22, x13, x22\n\t" + "subs x25, x10, x25\n\t" + "sbcs x26, x11, x26\n\t" + "sbcs x27, x12, x27\n\t" + "sbcs x28, x13, x28\n\t" + "csetm x5, cc\n\t" "mov x3, #-19\n\t" - "csetm %x[a], cc\n\t" - /* Mask the modulus */ - "and x3, %x[a], x3\n\t" - "and x4, %x[a], #0x7fffffffffffffff\n\t" + "extr x5, x5, x28, #63\n\t" + "mul x3, x5, x3\n\t" /* Add modulus (if underflow) */ - "adds x19, x19, x3\n\t" - "adcs x20, x20, %x[a]\n\t" - "adcs x21, x21, %x[a]\n\t" - "adc x22, x22, x4\n\t" - "stp x19, x20, [x29, #144]\n\t" - "stp x21, x22, [x29, #160]\n\t" + "subs x25, x25, x3\n\t" + "sbcs x26, x26, xzr\n\t" + "and x28, x28, #0x7fffffffffffffff\n\t" + "sbcs x27, x27, xzr\n\t" + "sbc x28, x28, xzr\n\t" + "stp x25, x26, [x29, #80]\n\t" + "stp x27, x28, [x29, #96]\n\t" /* Add */ - "adds x10, x14, x5\n\t" - "adcs x11, x15, x26\n\t" - "adcs x12, x16, x27\n\t" - "adc x13, x17, x28\n\t" - "mov x3, #-19\n\t" - "asr %x[a], x13, #63\n\t" - /* Mask the modulus */ - "and x3, %x[a], x3\n\t" - "and x4, %x[a], #0x7fffffffffffffff\n\t" + "adds x10, x14, x19\n\t" + "adcs x11, x15, x20\n\t" + "adcs x12, x16, x21\n\t" + "adcs x13, x17, x22\n\t" + "cset x5, cs\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x3, x5, x3\n\t" /* Sub modulus (if overflow) */ - 
"subs x10, x10, x3\n\t" - "sbcs x11, x11, %x[a]\n\t" - "sbcs x12, x12, %x[a]\n\t" - "sbc x13, x13, x4\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, xzr\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "adcs x12, x12, xzr\n\t" + "adc x13, x13, xzr\n\t" /* Sub */ - "subs x14, x14, x5\n\t" - "sbcs x15, x15, x26\n\t" - "sbcs x16, x16, x27\n\t" - "sbcs x17, x17, x28\n\t" + "subs x14, x14, x19\n\t" + "sbcs x15, x15, x20\n\t" + "sbcs x16, x16, x21\n\t" + "sbcs x17, x17, x22\n\t" + "csetm x5, cc\n\t" "mov x3, #-19\n\t" - "csetm %x[a], cc\n\t" - /* Mask the modulus */ - "and x3, %x[a], x3\n\t" - "and x4, %x[a], #0x7fffffffffffffff\n\t" + "extr x5, x5, x17, #63\n\t" + "mul x3, x5, x3\n\t" /* Add modulus (if underflow) */ - "adds x14, x14, x3\n\t" - "adcs x15, x15, %x[a]\n\t" - "adcs x16, x16, %x[a]\n\t" - "adc x17, x17, x4\n\t" + "subs x14, x14, x3\n\t" + "sbcs x15, x15, xzr\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "sbcs x16, x16, xzr\n\t" + "sbc x17, x17, xzr\n\t" /* Multiply */ - /* A[0] * B[0] */ - "mul x19, x14, x6\n\t" + /* A[0] * B[0] */ "umulh x20, x14, x6\n\t" - /* A[0] * B[1] */ - "mul x3, x14, x7\n\t" - "umulh x21, x14, x7\n\t" - "adds x20, x20, x3\n\t" - "adc x21, x21, xzr\n\t" - /* A[1] * B[0] */ + "mul x19, x14, x6\n\t" + /* A[2] * B[0] */ + "umulh x22, x16, x6\n\t" + "mul x21, x16, x6\n\t" + /* A[1] * B[0] */ "mul x3, x15, x6\n\t" + "adds x20, x20, x3\n\t" "umulh x4, x15, x6\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, x22, xzr\n\t" + /* A[1] * B[3] */ + "umulh x26, x15, x9\n\t" + "mul x25, x15, x9\n\t" + /* A[0] * B[1] */ + "mul x3, x14, x7\n\t" "adds x20, x20, x3\n\t" + "umulh x4, x14, x7\n\t" "adcs x21, x21, x4\n\t" - "adc x22, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x3, x14, x8\n\t" - "umulh x4, x14, x8\n\t" - "adds x21, x21, x3\n\t" - "adc x22, x22, x4\n\t" - /* A[1] * B[1] */ - "mul x3, x15, x7\n\t" - "umulh x4, x15, x7\n\t" - "adds x21, x21, x3\n\t" - "adcs x22, x22, x4\n\t" - "adc %x[a], xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x3, x16, x6\n\t" - "umulh x4, x16, x6\n\t" - "adds x21, x21, x3\n\t" - "adcs x22, x22, x4\n\t" - "adc %x[a], %x[a], xzr\n\t" - /* A[0] * B[3] */ - "mul x3, x14, x9\n\t" - "umulh x4, x14, x9\n\t" - "adds x22, x22, x3\n\t" - "adcs %x[a], %x[a], x4\n\t" - "adc x26, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x3, x15, x8\n\t" - "umulh x4, x15, x8\n\t" - "adds x22, x22, x3\n\t" - "adcs %x[a], %x[a], x4\n\t" - "adc x26, x26, xzr\n\t" - /* A[2] * B[1] */ + /* A[2] * B[1] */ "mul x3, x16, x7\n\t" + "adcs x22, x22, x3\n\t" "umulh x4, x16, x7\n\t" - "adds x22, x22, x3\n\t" - "adcs %x[a], %x[a], x4\n\t" + "adcs x25, x25, x4\n\t" "adc x26, x26, xzr\n\t" - /* A[3] * B[0] */ - "mul x3, x17, x6\n\t" - "umulh x4, x17, x6\n\t" + /* A[1] * B[2] */ + "mul x3, x15, x8\n\t" "adds x22, x22, x3\n\t" - "adcs %x[a], %x[a], x4\n\t" - "adc x26, x26, xzr\n\t" - /* A[1] * B[3] */ - "mul x3, x15, x9\n\t" - "umulh x4, x15, x9\n\t" - "adds %x[a], %x[a], x3\n\t" - "adcs x26, x26, x4\n\t" + "umulh x4, x15, x8\n\t" + "adcs x25, x25, x4\n\t" + "adcs x26, x26, xzr\n\t" "adc x27, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x3, x16, x8\n\t" - "umulh x4, x16, x8\n\t" - "adds %x[a], %x[a], x3\n\t" - "adcs x26, x26, x4\n\t" + /* A[0] * B[2] */ + "mul x3, x14, x8\n\t" + "adds x21, x21, x3\n\t" + "umulh x4, x14, x8\n\t" + "adcs x22, x22, x4\n\t" + "adcs x25, x25, xzr\n\t" + "adcs x26, x26, xzr\n\t" "adc x27, x27, xzr\n\t" - /* A[3] * B[1] */ + /* A[1] * B[1] */ + "mul x3, x15, x7\n\t" + "adds x21, x21, x3\n\t" + "umulh x4, x15, x7\n\t" + "adcs x22, x22, x4\n\t" + /* A[3] * B[1] */ "mul x3, 
x17, x7\n\t" + "adcs x25, x25, x3\n\t" "umulh x4, x17, x7\n\t" - "adds %x[a], %x[a], x3\n\t" "adcs x26, x26, x4\n\t" "adc x27, x27, xzr\n\t" - /* A[2] * B[3] */ + /* A[2] * B[2] */ + "mul x3, x16, x8\n\t" + "adds x25, x25, x3\n\t" + "umulh x4, x16, x8\n\t" + "adcs x26, x26, x4\n\t" + /* A[3] * B[3] */ + "mul x3, x17, x9\n\t" + "adcs x27, x27, x3\n\t" + "umulh x28, x17, x9\n\t" + "adc x28, x28, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x14, x9\n\t" + "adds x22, x22, x3\n\t" + "umulh x4, x14, x9\n\t" + "adcs x25, x25, x4\n\t" + /* A[2] * B[3] */ "mul x3, x16, x9\n\t" + "adcs x26, x26, x3\n\t" "umulh x4, x16, x9\n\t" - "adds x26, x26, x3\n\t" "adcs x27, x27, x4\n\t" - "adc x28, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x3, x17, x8\n\t" + "adc x28, x28, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x17, x6\n\t" + "adds x22, x22, x3\n\t" + "umulh x4, x17, x6\n\t" + "adcs x25, x25, x4\n\t" + /* A[3] * B[2] */ + "mul x3, x17, x8\n\t" + "adcs x26, x26, x3\n\t" "umulh x4, x17, x8\n\t" - "adds x26, x26, x3\n\t" "adcs x27, x27, x4\n\t" "adc x28, x28, xzr\n\t" - /* A[3] * B[3] */ - "mul x3, x17, x9\n\t" - "umulh x4, x17, x9\n\t" - "adds x27, x27, x3\n\t" - "adc x28, x28, x4\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x28, x28, x27, #63\n\t" - "extr x27, x27, x26, #63\n\t" - "extr x26, x26, %x[a], #63\n\t" - "extr %x[a], %x[a], x22, #63\n\t" - "and x22, x22, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x3, #19\n\t" - "mul x4, x3, %x[a]\n\t" - "umulh %x[a], x3, %x[a]\n\t" - "adds x19, x19, x4\n\t" - "mul x4, x3, x26\n\t" - "umulh x26, x3, x26\n\t" - "adcs x20, x20, x4\n\t" - "mul x4, x3, x27\n\t" - "umulh x27, x3, x27\n\t" - "adcs x21, x21, x4\n\t" + "mov x3, #38\n\t" "mul x4, x3, x28\n\t" + "adds x22, x22, x4\n\t" "umulh x5, x3, x28\n\t" - "adcs x22, x22, x4\n\t" - "adc x5, x5, xzr\n\t" - /* Add remaining product results in */ - "adds x20, x20, %x[a]\n\t" - "adcs x21, x21, x26\n\t" - "adcs x22, x22, x27\n\t" "adc x5, x5, xzr\n\t" - /* Overflow */ + "mov x3, #19\n\t" "extr x5, x5, x22, #63\n\t" "mul x5, x5, x3\n\t" "and x22, x22, #0x7fffffffffffffff\n\t" - "adds x19, x19, x5\n\t" - "adcs x20, x20, xzr\n\t" - "adcs x21, x21, xzr\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x25\n\t" + "adds x19, x19, x4\n\t" + "umulh x25, x3, x25\n\t" + "mul x4, x3, x26\n\t" + "adcs x20, x20, x4\n\t" + "umulh x26, x3, x26\n\t" + "mul x4, x3, x27\n\t" + "adcs x21, x21, x4\n\t" + "umulh x27, x3, x27\n\t" "adc x22, x22, xzr\n\t" - /* Reduce if top bit set */ - "and x5, x3, x22, asr 63\n\t" - "and x22, x22, #0x7fffffffffffffff\n\t" + /* Add high product results in */ "adds x19, x19, x5\n\t" - "adcs x20, x20, xzr\n\t" - "adcs x21, x21, xzr\n\t" - "adc x22, x22, xzr\n\t" + "adcs x20, x20, x25\n\t" + "adcs x21, x21, x26\n\t" + "adc x22, x22, x27\n\t" /* Store */ - "stp x19, x20, [x29, #112]\n\t" - "stp x21, x22, [x29, #128]\n\t" + "stp x19, x20, [x29, #48]\n\t" + "stp x21, x22, [x29, #64]\n\t" /* Multiply */ - "ldp %x[a], x26, [x29, #144]\n\t" - "ldp x27, x28, [x29, #160]\n\t" - /* A[0] * B[0] */ - "mul x19, x10, %x[a]\n\t" - "umulh x20, x10, %x[a]\n\t" - /* A[0] * B[1] */ - "mul x3, x10, x26\n\t" - "umulh x21, x10, x26\n\t" + "ldp x25, x26, [x29, #80]\n\t" + "ldp x27, x28, [x29, #96]\n\t" + /* A[0] * B[0] */ + "umulh x20, x10, x25\n\t" + "mul x19, x10, x25\n\t" + /* A[2] * B[0] */ + "umulh x22, x12, x25\n\t" + "mul x21, x12, x25\n\t" + /* A[1] * B[0] */ + "mul x3, x11, x25\n\t" "adds x20, x20, x3\n\t" - "adc x21, x21, xzr\n\t" - /* A[1] * B[0] */ - "mul x3, x11, %x[a]\n\t" - "umulh 
x4, x11, %x[a]\n\t" + "umulh x4, x11, x25\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, x22, xzr\n\t" + /* A[1] * B[3] */ + "umulh x15, x11, x28\n\t" + "mul x14, x11, x28\n\t" + /* A[0] * B[1] */ + "mul x3, x10, x26\n\t" "adds x20, x20, x3\n\t" + "umulh x4, x10, x26\n\t" "adcs x21, x21, x4\n\t" - "adc x22, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x3, x10, x27\n\t" - "umulh x4, x10, x27\n\t" - "adds x21, x21, x3\n\t" - "adc x22, x22, x4\n\t" - /* A[1] * B[1] */ - "mul x3, x11, x26\n\t" - "umulh x4, x11, x26\n\t" - "adds x21, x21, x3\n\t" - "adcs x22, x22, x4\n\t" - "adc x14, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x3, x12, %x[a]\n\t" - "umulh x4, x12, %x[a]\n\t" - "adds x21, x21, x3\n\t" - "adcs x22, x22, x4\n\t" - "adc x14, x14, xzr\n\t" - /* A[0] * B[3] */ - "mul x3, x10, x28\n\t" - "umulh x4, x10, x28\n\t" - "adds x22, x22, x3\n\t" - "adcs x14, x14, x4\n\t" - "adc x15, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x3, x11, x27\n\t" - "umulh x4, x11, x27\n\t" - "adds x22, x22, x3\n\t" - "adcs x14, x14, x4\n\t" - "adc x15, x15, xzr\n\t" - /* A[2] * B[1] */ + /* A[2] * B[1] */ "mul x3, x12, x26\n\t" + "adcs x22, x22, x3\n\t" "umulh x4, x12, x26\n\t" - "adds x22, x22, x3\n\t" "adcs x14, x14, x4\n\t" "adc x15, x15, xzr\n\t" - /* A[3] * B[0] */ - "mul x3, x13, %x[a]\n\t" - "umulh x4, x13, %x[a]\n\t" + /* A[1] * B[2] */ + "mul x3, x11, x27\n\t" "adds x22, x22, x3\n\t" + "umulh x4, x11, x27\n\t" "adcs x14, x14, x4\n\t" - "adc x15, x15, xzr\n\t" - /* A[1] * B[3] */ - "mul x3, x11, x28\n\t" - "umulh x4, x11, x28\n\t" - "adds x14, x14, x3\n\t" - "adcs x15, x15, x4\n\t" + "adcs x15, x15, xzr\n\t" "adc x16, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x3, x12, x27\n\t" - "umulh x4, x12, x27\n\t" - "adds x14, x14, x3\n\t" - "adcs x15, x15, x4\n\t" + /* A[0] * B[2] */ + "mul x3, x10, x27\n\t" + "adds x21, x21, x3\n\t" + "umulh x4, x10, x27\n\t" + "adcs x22, x22, x4\n\t" + "adcs x14, x14, xzr\n\t" + "adcs x15, x15, xzr\n\t" "adc x16, x16, xzr\n\t" - /* A[3] * B[1] */ + /* A[1] * B[1] */ + "mul x3, x11, x26\n\t" + "adds x21, x21, x3\n\t" + "umulh x4, x11, x26\n\t" + "adcs x22, x22, x4\n\t" + /* A[3] * B[1] */ "mul x3, x13, x26\n\t" + "adcs x14, x14, x3\n\t" "umulh x4, x13, x26\n\t" - "adds x14, x14, x3\n\t" "adcs x15, x15, x4\n\t" "adc x16, x16, xzr\n\t" - /* A[2] * B[3] */ + /* A[2] * B[2] */ + "mul x3, x12, x27\n\t" + "adds x14, x14, x3\n\t" + "umulh x4, x12, x27\n\t" + "adcs x15, x15, x4\n\t" + /* A[3] * B[3] */ + "mul x3, x13, x28\n\t" + "adcs x16, x16, x3\n\t" + "umulh x17, x13, x28\n\t" + "adc x17, x17, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x10, x28\n\t" + "adds x22, x22, x3\n\t" + "umulh x4, x10, x28\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * B[3] */ "mul x3, x12, x28\n\t" + "adcs x15, x15, x3\n\t" "umulh x4, x12, x28\n\t" - "adds x15, x15, x3\n\t" "adcs x16, x16, x4\n\t" - "adc x17, xzr, xzr\n\t" - /* A[3] * B[2] */ + "adc x17, x17, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x13, x25\n\t" + "adds x22, x22, x3\n\t" + "umulh x4, x13, x25\n\t" + "adcs x14, x14, x4\n\t" + /* A[3] * B[2] */ "mul x3, x13, x27\n\t" + "adcs x15, x15, x3\n\t" "umulh x4, x13, x27\n\t" - "adds x15, x15, x3\n\t" "adcs x16, x16, x4\n\t" "adc x17, x17, xzr\n\t" - /* A[3] * B[3] */ - "mul x3, x13, x28\n\t" - "umulh x4, x13, x28\n\t" - "adds x16, x16, x3\n\t" - "adc x17, x17, x4\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x17, x17, x16, #63\n\t" - "extr x16, x16, x15, #63\n\t" - "extr x15, x15, x14, #63\n\t" - "extr x14, x14, x22, #63\n\t" - "and x22, x22, #0x7fffffffffffffff\n\t" - /* Multiply top 
half by 19 */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x22, x22, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" "mov x3, #19\n\t" + "extr x5, x5, x22, #63\n\t" + "mul x5, x5, x3\n\t" + "and x22, x22, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" "mul x4, x3, x14\n\t" - "umulh x14, x3, x14\n\t" "adds x19, x19, x4\n\t" + "umulh x14, x3, x14\n\t" "mul x4, x3, x15\n\t" - "umulh x15, x3, x15\n\t" "adcs x20, x20, x4\n\t" + "umulh x15, x3, x15\n\t" "mul x4, x3, x16\n\t" - "umulh x16, x3, x16\n\t" "adcs x21, x21, x4\n\t" - "mul x4, x3, x17\n\t" - "umulh x5, x3, x17\n\t" - "adcs x22, x22, x4\n\t" - "adc x5, x5, xzr\n\t" - /* Add remaining product results in */ - "adds x20, x20, x14\n\t" - "adcs x21, x21, x15\n\t" - "adcs x22, x22, x16\n\t" - "adc x5, x5, xzr\n\t" - /* Overflow */ - "extr x5, x5, x22, #63\n\t" - "mul x5, x5, x3\n\t" - "and x22, x22, #0x7fffffffffffffff\n\t" - "adds x19, x19, x5\n\t" - "adcs x20, x20, xzr\n\t" - "adcs x21, x21, xzr\n\t" + "umulh x16, x3, x16\n\t" "adc x22, x22, xzr\n\t" - /* Reduce if top bit set */ - "and x5, x3, x22, asr 63\n\t" - "and x22, x22, #0x7fffffffffffffff\n\t" + /* Add high product results in */ "adds x19, x19, x5\n\t" - "adcs x20, x20, xzr\n\t" - "adcs x21, x21, xzr\n\t" - "adc x22, x22, xzr\n\t" - /* Store */ + "adcs x20, x20, x14\n\t" + "adcs x21, x21, x15\n\t" + "adc x22, x22, x16\n\t" /* Square */ /* A[0] * A[1] */ - "mul x11, %x[a], x26\n\t" - "umulh x12, %x[a], x26\n\t" + "umulh x12, x25, x26\n\t" + "mul x11, x25, x26\n\t" + /* A[0] * A[3] */ + "umulh x14, x25, x28\n\t" + "mul x13, x25, x28\n\t" /* A[0] * A[2] */ - "mul x3, %x[a], x27\n\t" - "umulh x13, %x[a], x27\n\t" + "mul x3, x25, x27\n\t" "adds x12, x12, x3\n\t" - "adc x13, x13, xzr\n\t" - /* A[0] * A[3] */ - "mul x3, %x[a], x28\n\t" - "umulh x14, %x[a], x28\n\t" - "adds x13, x13, x3\n\t" - "adc x14, x14, xzr\n\t" + "umulh x4, x25, x27\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x26, x28\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x26, x28\n\t" + "adc x15, x15, xzr\n\t" /* A[1] * A[2] */ "mul x3, x26, x27\n\t" - "umulh x4, x26, x27\n\t" "adds x13, x13, x3\n\t" + "umulh x4, x26, x27\n\t" "adcs x14, x14, x4\n\t" - "adc x15, xzr, xzr\n\t" - /* A[1] * A[3] */ - "mul x3, x26, x28\n\t" - "umulh x4, x26, x28\n\t" - "adds x14, x14, x3\n\t" - "adc x15, x15, x4\n\t" /* A[2] * A[3] */ "mul x3, x27, x28\n\t" + "adcs x15, x15, x3\n\t" "umulh x16, x27, x28\n\t" - "adds x15, x15, x3\n\t" "adc x16, x16, xzr\n\t" /* Double */ "adds x11, x11, x11\n\t" @@ -1472,304 +1961,241 @@ int curve25519(byte* r, const byte* n, const byte* a) "adcs x16, x16, x16\n\t" "adc x17, xzr, xzr\n\t" /* A[0] * A[0] */ - "mul x10, %x[a], %x[a]\n\t" - "umulh x5, %x[a], %x[a]\n\t" + "umulh x4, x25, x25\n\t" + "mul x10, x25, x25\n\t" /* A[1] * A[1] */ "mul x3, x26, x26\n\t" + "adds x11, x11, x4\n\t" "umulh x4, x26, x26\n\t" - "adds x11, x11, x5\n\t" "adcs x12, x12, x3\n\t" - "adc x5, x4, xzr\n\t" /* A[2] * A[2] */ "mul x3, x27, x27\n\t" + "adcs x13, x13, x4\n\t" "umulh x4, x27, x27\n\t" - "adds x13, x13, x5\n\t" "adcs x14, x14, x3\n\t" - "adc x5, x4, xzr\n\t" /* A[3] * A[3] */ "mul x3, x28, x28\n\t" + "adcs x15, x15, x4\n\t" "umulh x4, x28, x28\n\t" - "adds x15, x15, x5\n\t" "adcs x16, x16, x3\n\t" "adc x17, x17, x4\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x17, x17, x16, #63\n\t" - "extr x16, x16, x15, #63\n\t" - "extr x15, x15, x14, #63\n\t" - "extr x14, x14, x13, #63\n\t" - "and x13, x13, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ + "mov 
x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" "mul x4, x3, x14\n\t" - "umulh x14, x3, x14\n\t" "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" "mul x4, x3, x15\n\t" - "umulh x15, x3, x15\n\t" "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" "mul x4, x3, x16\n\t" - "umulh x16, x3, x16\n\t" "adcs x12, x12, x4\n\t" - "mul x4, x3, x17\n\t" - "umulh x5, x3, x17\n\t" - "adcs x13, x13, x4\n\t" - "adc x5, x5, xzr\n\t" - /* Add remaining product results in */ - "adds x11, x11, x14\n\t" - "adcs x12, x12, x15\n\t" - "adcs x13, x13, x16\n\t" - "adc x5, x5, xzr\n\t" - /* Overflow */ - "extr x5, x5, x13, #63\n\t" - "mul x5, x5, x3\n\t" - "and x13, x13, #0x7fffffffffffffff\n\t" - "adds x10, x10, x5\n\t" - "adcs x11, x11, xzr\n\t" - "adcs x12, x12, xzr\n\t" + "umulh x16, x3, x16\n\t" "adc x13, x13, xzr\n\t" - /* Reduce if top bit set */ - "and x5, x3, x13, asr 63\n\t" - "and x13, x13, #0x7fffffffffffffff\n\t" + /* Add high product results in */ "adds x10, x10, x5\n\t" - "adcs x11, x11, xzr\n\t" - "adcs x12, x12, xzr\n\t" - "adc x13, x13, xzr\n\t" - /* Store */ + "adcs x11, x11, x14\n\t" + "adcs x12, x12, x15\n\t" + "adc x13, x13, x16\n\t" /* Square */ /* A[0] * A[1] */ - "mul x15, x6, x7\n\t" "umulh x16, x6, x7\n\t" + "mul x15, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x25, x6, x9\n\t" + "mul x17, x6, x9\n\t" /* A[0] * A[2] */ "mul x3, x6, x8\n\t" - "umulh x17, x6, x8\n\t" "adds x16, x16, x3\n\t" - "adc x17, x17, xzr\n\t" - /* A[0] * A[3] */ - "mul x3, x6, x9\n\t" - "umulh %x[a], x6, x9\n\t" - "adds x17, x17, x3\n\t" - "adc %x[a], %x[a], xzr\n\t" + "umulh x4, x6, x8\n\t" + "adcs x17, x17, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x25, x25, x3\n\t" + "umulh x26, x7, x9\n\t" + "adc x26, x26, xzr\n\t" /* A[1] * A[2] */ "mul x3, x7, x8\n\t" - "umulh x4, x7, x8\n\t" "adds x17, x17, x3\n\t" - "adcs %x[a], %x[a], x4\n\t" - "adc x26, xzr, xzr\n\t" - /* A[1] * A[3] */ - "mul x3, x7, x9\n\t" - "umulh x4, x7, x9\n\t" - "adds %x[a], %x[a], x3\n\t" - "adc x26, x26, x4\n\t" + "umulh x4, x7, x8\n\t" + "adcs x25, x25, x4\n\t" /* A[2] * A[3] */ "mul x3, x8, x9\n\t" + "adcs x26, x26, x3\n\t" "umulh x27, x8, x9\n\t" - "adds x26, x26, x3\n\t" "adc x27, x27, xzr\n\t" /* Double */ "adds x15, x15, x15\n\t" "adcs x16, x16, x16\n\t" "adcs x17, x17, x17\n\t" - "adcs %x[a], %x[a], %x[a]\n\t" + "adcs x25, x25, x25\n\t" "adcs x26, x26, x26\n\t" "adcs x27, x27, x27\n\t" "adc x28, xzr, xzr\n\t" /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" "mul x14, x6, x6\n\t" - "umulh x5, x6, x6\n\t" /* A[1] * A[1] */ "mul x3, x7, x7\n\t" + "adds x15, x15, x4\n\t" "umulh x4, x7, x7\n\t" - "adds x15, x15, x5\n\t" "adcs x16, x16, x3\n\t" - "adc x5, x4, xzr\n\t" /* A[2] * A[2] */ "mul x3, x8, x8\n\t" + "adcs x17, x17, x4\n\t" "umulh x4, x8, x8\n\t" - "adds x17, x17, x5\n\t" - "adcs %x[a], %x[a], x3\n\t" - "adc x5, x4, xzr\n\t" + "adcs x25, x25, x3\n\t" /* A[3] * A[3] */ "mul x3, x9, x9\n\t" + "adcs x26, x26, x4\n\t" "umulh x4, x9, x9\n\t" - "adds x26, x26, x5\n\t" "adcs x27, x27, x3\n\t" "adc x28, x28, x4\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x28, x28, x27, #63\n\t" - "extr x27, x27, x26, #63\n\t" - "extr x26, x26, %x[a], #63\n\t" - "extr %x[a], %x[a], x17, #63\n\t" - "and x17, x17, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x3, #19\n\t" - "mul x4, x3, %x[a]\n\t" - "umulh 
%x[a], x3, %x[a]\n\t" - "adds x14, x14, x4\n\t" - "mul x4, x3, x26\n\t" - "umulh x26, x3, x26\n\t" - "adcs x15, x15, x4\n\t" - "mul x4, x3, x27\n\t" - "umulh x27, x3, x27\n\t" - "adcs x16, x16, x4\n\t" + "mov x3, #38\n\t" "mul x4, x3, x28\n\t" + "adds x17, x17, x4\n\t" "umulh x5, x3, x28\n\t" - "adcs x17, x17, x4\n\t" - "adc x5, x5, xzr\n\t" - /* Add remaining product results in */ - "adds x15, x15, %x[a]\n\t" - "adcs x16, x16, x26\n\t" - "adcs x17, x17, x27\n\t" "adc x5, x5, xzr\n\t" - /* Overflow */ + "mov x3, #19\n\t" "extr x5, x5, x17, #63\n\t" "mul x5, x5, x3\n\t" "and x17, x17, #0x7fffffffffffffff\n\t" - "adds x14, x14, x5\n\t" - "adcs x15, x15, xzr\n\t" - "adcs x16, x16, xzr\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x25\n\t" + "adds x14, x14, x4\n\t" + "umulh x25, x3, x25\n\t" + "mul x4, x3, x26\n\t" + "adcs x15, x15, x4\n\t" + "umulh x26, x3, x26\n\t" + "mul x4, x3, x27\n\t" + "adcs x16, x16, x4\n\t" + "umulh x27, x3, x27\n\t" "adc x17, x17, xzr\n\t" - /* Reduce if top bit set */ - "and x5, x3, x17, asr 63\n\t" - "and x17, x17, #0x7fffffffffffffff\n\t" + /* Add high product results in */ "adds x14, x14, x5\n\t" - "adcs x15, x15, xzr\n\t" - "adcs x16, x16, xzr\n\t" - "adc x17, x17, xzr\n\t" - /* Store */ + "adcs x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, x27\n\t" /* Multiply */ - /* A[0] * B[0] */ - "mul x6, x14, x10\n\t" + /* A[0] * B[0] */ "umulh x7, x14, x10\n\t" - /* A[0] * B[1] */ - "mul x3, x14, x11\n\t" - "umulh x8, x14, x11\n\t" - "adds x7, x7, x3\n\t" - "adc x8, x8, xzr\n\t" - /* A[1] * B[0] */ + "mul x6, x14, x10\n\t" + /* A[2] * B[0] */ + "umulh x9, x16, x10\n\t" + "mul x8, x16, x10\n\t" + /* A[1] * B[0] */ "mul x3, x15, x10\n\t" + "adds x7, x7, x3\n\t" "umulh x4, x15, x10\n\t" + "adcs x8, x8, x4\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[3] */ + "umulh x26, x15, x13\n\t" + "mul x25, x15, x13\n\t" + /* A[0] * B[1] */ + "mul x3, x14, x11\n\t" "adds x7, x7, x3\n\t" + "umulh x4, x14, x11\n\t" "adcs x8, x8, x4\n\t" - "adc x9, xzr, xzr\n\t" - /* A[0] * B[2] */ + /* A[2] * B[1] */ + "mul x3, x16, x11\n\t" + "adcs x9, x9, x3\n\t" + "umulh x4, x16, x11\n\t" + "adcs x25, x25, x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x15, x12\n\t" + "adds x9, x9, x3\n\t" + "umulh x4, x15, x12\n\t" + "adcs x25, x25, x4\n\t" + "adcs x26, x26, xzr\n\t" + "adc x27, xzr, xzr\n\t" + /* A[0] * B[2] */ "mul x3, x14, x12\n\t" - "umulh x4, x14, x12\n\t" "adds x8, x8, x3\n\t" - "adc x9, x9, x4\n\t" - /* A[1] * B[1] */ + "umulh x4, x14, x12\n\t" + "adcs x9, x9, x4\n\t" + "adcs x25, x25, xzr\n\t" + "adcs x26, x26, xzr\n\t" + "adc x27, x27, xzr\n\t" + /* A[1] * B[1] */ "mul x3, x15, x11\n\t" - "umulh x4, x15, x11\n\t" - "adds x8, x8, x3\n\t" - "adcs x9, x9, x4\n\t" - "adc %x[a], xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x3, x16, x10\n\t" - "umulh x4, x16, x10\n\t" "adds x8, x8, x3\n\t" + "umulh x4, x15, x11\n\t" "adcs x9, x9, x4\n\t" - "adc %x[a], %x[a], xzr\n\t" - /* A[0] * B[3] */ - "mul x3, x14, x13\n\t" - "umulh x4, x14, x13\n\t" - "adds x9, x9, x3\n\t" - "adcs %x[a], %x[a], x4\n\t" - "adc x26, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x3, x15, x12\n\t" - "umulh x4, x15, x12\n\t" - "adds x9, x9, x3\n\t" - "adcs %x[a], %x[a], x4\n\t" - "adc x26, x26, xzr\n\t" - /* A[2] * B[1] */ - "mul x3, x16, x11\n\t" - "umulh x4, x16, x11\n\t" - "adds x9, x9, x3\n\t" - "adcs %x[a], %x[a], x4\n\t" - "adc x26, x26, xzr\n\t" - /* A[3] * B[0] */ - "mul x3, x17, x10\n\t" - "umulh x4, x17, x10\n\t" - "adds x9, x9, x3\n\t" - "adcs %x[a], %x[a], x4\n\t" - "adc x26, x26, xzr\n\t" - /* A[1] * 
B[3] */ - "mul x3, x15, x13\n\t" - "umulh x4, x15, x13\n\t" - "adds %x[a], %x[a], x3\n\t" - "adcs x26, x26, x4\n\t" - "adc x27, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x3, x16, x12\n\t" - "umulh x4, x16, x12\n\t" - "adds %x[a], %x[a], x3\n\t" - "adcs x26, x26, x4\n\t" - "adc x27, x27, xzr\n\t" - /* A[3] * B[1] */ + /* A[3] * B[1] */ "mul x3, x17, x11\n\t" + "adcs x25, x25, x3\n\t" "umulh x4, x17, x11\n\t" - "adds %x[a], %x[a], x3\n\t" "adcs x26, x26, x4\n\t" "adc x27, x27, xzr\n\t" - /* A[2] * B[3] */ + /* A[2] * B[2] */ + "mul x3, x16, x12\n\t" + "adds x25, x25, x3\n\t" + "umulh x4, x16, x12\n\t" + "adcs x26, x26, x4\n\t" + /* A[3] * B[3] */ + "mul x3, x17, x13\n\t" + "adcs x27, x27, x3\n\t" + "umulh x28, x17, x13\n\t" + "adc x28, x28, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x14, x13\n\t" + "adds x9, x9, x3\n\t" + "umulh x4, x14, x13\n\t" + "adcs x25, x25, x4\n\t" + /* A[2] * B[3] */ "mul x3, x16, x13\n\t" + "adcs x26, x26, x3\n\t" "umulh x4, x16, x13\n\t" - "adds x26, x26, x3\n\t" "adcs x27, x27, x4\n\t" - "adc x28, xzr, xzr\n\t" - /* A[3] * B[2] */ + "adc x28, x28, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x17, x10\n\t" + "adds x9, x9, x3\n\t" + "umulh x4, x17, x10\n\t" + "adcs x25, x25, x4\n\t" + /* A[3] * B[2] */ "mul x3, x17, x12\n\t" + "adcs x26, x26, x3\n\t" "umulh x4, x17, x12\n\t" - "adds x26, x26, x3\n\t" "adcs x27, x27, x4\n\t" "adc x28, x28, xzr\n\t" - /* A[3] * B[3] */ - "mul x3, x17, x13\n\t" - "umulh x4, x17, x13\n\t" - "adds x27, x27, x3\n\t" - "adc x28, x28, x4\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x28, x28, x27, #63\n\t" - "extr x27, x27, x26, #63\n\t" - "extr x26, x26, %x[a], #63\n\t" - "extr %x[a], %x[a], x9, #63\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x3, #19\n\t" - "mul x4, x3, %x[a]\n\t" - "umulh %x[a], x3, %x[a]\n\t" - "adds x6, x6, x4\n\t" - "mul x4, x3, x26\n\t" - "umulh x26, x3, x26\n\t" - "adcs x7, x7, x4\n\t" - "mul x4, x3, x27\n\t" - "umulh x27, x3, x27\n\t" - "adcs x8, x8, x4\n\t" + "mov x3, #38\n\t" "mul x4, x3, x28\n\t" + "adds x9, x9, x4\n\t" "umulh x5, x3, x28\n\t" - "adcs x9, x9, x4\n\t" - "adc x5, x5, xzr\n\t" - /* Add remaining product results in */ - "adds x7, x7, %x[a]\n\t" - "adcs x8, x8, x26\n\t" - "adcs x9, x9, x27\n\t" "adc x5, x5, xzr\n\t" - /* Overflow */ + "mov x3, #19\n\t" "extr x5, x5, x9, #63\n\t" "mul x5, x5, x3\n\t" "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x25\n\t" + "adds x6, x6, x4\n\t" + "umulh x25, x3, x25\n\t" + "mul x4, x3, x26\n\t" + "adcs x7, x7, x4\n\t" + "umulh x26, x3, x26\n\t" + "mul x4, x3, x27\n\t" + "adcs x8, x8, x4\n\t" + "umulh x27, x3, x27\n\t" "adc x9, x9, xzr\n\t" - /* Reduce if top bit set */ - "and x5, x3, x9, asr 63\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" + /* Add high product results in */ "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" + "adcs x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, x27\n\t" /* Store */ "stp x6, x7, [%x[r]]\n\t" "stp x8, x9, [%x[r], #16]\n\t" @@ -1778,576 +2204,479 @@ int curve25519(byte* r, const byte* n, const byte* a) "sbcs x15, x15, x11\n\t" "sbcs x16, x16, x12\n\t" "sbcs x17, x17, x13\n\t" + "csetm x5, cc\n\t" "mov x3, #-19\n\t" - "csetm %x[a], cc\n\t" /* Mask the modulus */ - "and x3, %x[a], x3\n\t" - "and x4, %x[a], #0x7fffffffffffffff\n\t" + "extr x5, x5, x17, #63\n\t" + "mul x3, x5, x3\n\t" /* Add modulus (if underflow) */ - 
"adds x14, x14, x3\n\t" - "adcs x15, x15, %x[a]\n\t" - "adcs x16, x16, %x[a]\n\t" - "adc x17, x17, x4\n\t" + "subs x14, x14, x3\n\t" + "sbcs x15, x15, xzr\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "sbcs x16, x16, xzr\n\t" + "sbc x17, x17, xzr\n\t" /* Multiply by 121666 */ "mov x5, #0xdb42\n\t" "movk x5, #1, lsl 16\n\t" "mul x6, x14, x5\n\t" "umulh x7, x14, x5\n\t" "mul x3, x15, x5\n\t" - "umulh x4, x15, x5\n\t" + "umulh x8, x15, x5\n\t" "adds x7, x7, x3\n\t" - "adc x8, xzr, x4\n\t" + "adc x8, x8, xzr\n\t" "mul x3, x16, x5\n\t" - "umulh x4, x16, x5\n\t" + "umulh x9, x16, x5\n\t" "adds x8, x8, x3\n\t" - "adc x9, xzr, x4\n\t" + "adc x9, x9, xzr\n\t" "mul x3, x17, x5\n\t" "umulh x4, x17, x5\n\t" "adds x9, x9, x3\n\t" - "adc x4, xzr, x4\n\t" + "adc x4, x4, xzr\n\t" "mov x5, #19\n\t" "extr x4, x4, x9, #63\n\t" "mul x4, x4, x5\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" "adds x6, x6, x4\n\t" "adcs x7, x7, xzr\n\t" + "and x9, x9, #0x7fffffffffffffff\n\t" "adcs x8, x8, xzr\n\t" "adc x9, x9, xzr\n\t" /* Add */ "adds x10, x10, x6\n\t" "adcs x11, x11, x7\n\t" "adcs x12, x12, x8\n\t" - "adc x13, x13, x9\n\t" - "mov x3, #-19\n\t" - "asr %x[a], x13, #63\n\t" + "adcs x13, x13, x9\n\t" + "cset x5, cs\n\t" + "mov x3, #19\n\t" /* Mask the modulus */ - "and x3, %x[a], x3\n\t" - "and x4, %x[a], #0x7fffffffffffffff\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x3, x5, x3\n\t" /* Sub modulus (if overflow) */ - "subs x10, x10, x3\n\t" - "sbcs x11, x11, %x[a]\n\t" - "sbcs x12, x12, %x[a]\n\t" - "sbc x13, x13, x4\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, xzr\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "adcs x12, x12, xzr\n\t" + "adc x13, x13, xzr\n\t" /* Multiply */ - /* A[0] * B[0] */ - "mul x6, x14, x10\n\t" + /* A[0] * B[0] */ "umulh x7, x14, x10\n\t" - /* A[0] * B[1] */ - "mul x3, x14, x11\n\t" - "umulh x8, x14, x11\n\t" - "adds x7, x7, x3\n\t" - "adc x8, x8, xzr\n\t" - /* A[1] * B[0] */ + "mul x6, x14, x10\n\t" + /* A[2] * B[0] */ + "umulh x9, x16, x10\n\t" + "mul x8, x16, x10\n\t" + /* A[1] * B[0] */ "mul x3, x15, x10\n\t" + "adds x7, x7, x3\n\t" "umulh x4, x15, x10\n\t" + "adcs x8, x8, x4\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[3] */ + "umulh x26, x15, x13\n\t" + "mul x25, x15, x13\n\t" + /* A[0] * B[1] */ + "mul x3, x14, x11\n\t" "adds x7, x7, x3\n\t" + "umulh x4, x14, x11\n\t" "adcs x8, x8, x4\n\t" - "adc x9, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x3, x14, x12\n\t" - "umulh x4, x14, x12\n\t" - "adds x8, x8, x3\n\t" - "adc x9, x9, x4\n\t" - /* A[1] * B[1] */ - "mul x3, x15, x11\n\t" - "umulh x4, x15, x11\n\t" - "adds x8, x8, x3\n\t" - "adcs x9, x9, x4\n\t" - "adc %x[a], xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x3, x16, x10\n\t" - "umulh x4, x16, x10\n\t" - "adds x8, x8, x3\n\t" - "adcs x9, x9, x4\n\t" - "adc %x[a], %x[a], xzr\n\t" - /* A[0] * B[3] */ - "mul x3, x14, x13\n\t" - "umulh x4, x14, x13\n\t" - "adds x9, x9, x3\n\t" - "adcs %x[a], %x[a], x4\n\t" - "adc x26, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x3, x15, x12\n\t" - "umulh x4, x15, x12\n\t" - "adds x9, x9, x3\n\t" - "adcs %x[a], %x[a], x4\n\t" - "adc x26, x26, xzr\n\t" - /* A[2] * B[1] */ + /* A[2] * B[1] */ "mul x3, x16, x11\n\t" + "adcs x9, x9, x3\n\t" "umulh x4, x16, x11\n\t" - "adds x9, x9, x3\n\t" - "adcs %x[a], %x[a], x4\n\t" + "adcs x25, x25, x4\n\t" "adc x26, x26, xzr\n\t" - /* A[3] * B[0] */ - "mul x3, x17, x10\n\t" - "umulh x4, x17, x10\n\t" + /* A[1] * B[2] */ + "mul x3, x15, x12\n\t" "adds x9, x9, x3\n\t" - "adcs %x[a], %x[a], x4\n\t" - "adc x26, x26, xzr\n\t" - /* A[1] * B[3] */ - "mul x3, x15, x13\n\t" - "umulh 
x4, x15, x13\n\t" - "adds %x[a], %x[a], x3\n\t" - "adcs x26, x26, x4\n\t" + "umulh x4, x15, x12\n\t" + "adcs x25, x25, x4\n\t" + "adcs x26, x26, xzr\n\t" "adc x27, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x3, x16, x12\n\t" - "umulh x4, x16, x12\n\t" - "adds %x[a], %x[a], x3\n\t" - "adcs x26, x26, x4\n\t" + /* A[0] * B[2] */ + "mul x3, x14, x12\n\t" + "adds x8, x8, x3\n\t" + "umulh x4, x14, x12\n\t" + "adcs x9, x9, x4\n\t" + "adcs x25, x25, xzr\n\t" + "adcs x26, x26, xzr\n\t" "adc x27, x27, xzr\n\t" - /* A[3] * B[1] */ + /* A[1] * B[1] */ + "mul x3, x15, x11\n\t" + "adds x8, x8, x3\n\t" + "umulh x4, x15, x11\n\t" + "adcs x9, x9, x4\n\t" + /* A[3] * B[1] */ "mul x3, x17, x11\n\t" + "adcs x25, x25, x3\n\t" "umulh x4, x17, x11\n\t" - "adds %x[a], %x[a], x3\n\t" "adcs x26, x26, x4\n\t" "adc x27, x27, xzr\n\t" - /* A[2] * B[3] */ + /* A[2] * B[2] */ + "mul x3, x16, x12\n\t" + "adds x25, x25, x3\n\t" + "umulh x4, x16, x12\n\t" + "adcs x26, x26, x4\n\t" + /* A[3] * B[3] */ + "mul x3, x17, x13\n\t" + "adcs x27, x27, x3\n\t" + "umulh x28, x17, x13\n\t" + "adc x28, x28, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x14, x13\n\t" + "adds x9, x9, x3\n\t" + "umulh x4, x14, x13\n\t" + "adcs x25, x25, x4\n\t" + /* A[2] * B[3] */ "mul x3, x16, x13\n\t" + "adcs x26, x26, x3\n\t" "umulh x4, x16, x13\n\t" - "adds x26, x26, x3\n\t" "adcs x27, x27, x4\n\t" - "adc x28, xzr, xzr\n\t" - /* A[3] * B[2] */ + "adc x28, x28, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x17, x10\n\t" + "adds x9, x9, x3\n\t" + "umulh x4, x17, x10\n\t" + "adcs x25, x25, x4\n\t" + /* A[3] * B[2] */ "mul x3, x17, x12\n\t" + "adcs x26, x26, x3\n\t" "umulh x4, x17, x12\n\t" - "adds x26, x26, x3\n\t" "adcs x27, x27, x4\n\t" "adc x28, x28, xzr\n\t" - /* A[3] * B[3] */ - "mul x3, x17, x13\n\t" - "umulh x4, x17, x13\n\t" - "adds x27, x27, x3\n\t" - "adc x28, x28, x4\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x28, x28, x27, #63\n\t" - "extr x27, x27, x26, #63\n\t" - "extr x26, x26, %x[a], #63\n\t" - "extr %x[a], %x[a], x9, #63\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x3, #19\n\t" - "mul x4, x3, %x[a]\n\t" - "umulh %x[a], x3, %x[a]\n\t" - "adds x6, x6, x4\n\t" - "mul x4, x3, x26\n\t" - "umulh x26, x3, x26\n\t" - "adcs x7, x7, x4\n\t" - "mul x4, x3, x27\n\t" - "umulh x27, x3, x27\n\t" - "adcs x8, x8, x4\n\t" + "mov x3, #38\n\t" "mul x4, x3, x28\n\t" + "adds x9, x9, x4\n\t" "umulh x5, x3, x28\n\t" - "adcs x9, x9, x4\n\t" - "adc x5, x5, xzr\n\t" - /* Add remaining product results in */ - "adds x7, x7, %x[a]\n\t" - "adcs x8, x8, x26\n\t" - "adcs x9, x9, x27\n\t" "adc x5, x5, xzr\n\t" - /* Overflow */ + "mov x3, #19\n\t" "extr x5, x5, x9, #63\n\t" "mul x5, x5, x3\n\t" "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x25\n\t" + "adds x6, x6, x4\n\t" + "umulh x25, x3, x25\n\t" + "mul x4, x3, x26\n\t" + "adcs x7, x7, x4\n\t" + "umulh x26, x3, x26\n\t" + "mul x4, x3, x27\n\t" + "adcs x8, x8, x4\n\t" + "umulh x27, x3, x27\n\t" "adc x9, x9, xzr\n\t" - /* Reduce if top bit set */ - "and x5, x3, x9, asr 63\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" + /* Add high product results in */ "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" + "adcs x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, x27\n\t" /* Store */ "stp x6, x7, [x29, #16]\n\t" "stp x8, x9, [x29, #32]\n\t" /* Add */ - "ldp x6, x7, [x29, #112]\n\t" - "ldp x8, x9, [x29, 
#128]\n\t" - "adds x10, x6, x19\n\t" - "adcs x11, x7, x20\n\t" - "adcs x12, x8, x21\n\t" - "adc x13, x9, x22\n\t" - "mov x3, #-19\n\t" - "asr %x[a], x13, #63\n\t" - /* Mask the modulus */ - "and x3, %x[a], x3\n\t" - "and x4, %x[a], #0x7fffffffffffffff\n\t" + "ldp x25, x26, [x29, #48]\n\t" + "ldp x27, x28, [x29, #64]\n\t" + "adds x10, x25, x19\n\t" + "adcs x11, x26, x20\n\t" + "adcs x12, x27, x21\n\t" + "adcs x13, x28, x22\n\t" + "cset x5, cs\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x3, x5, x3\n\t" /* Sub modulus (if overflow) */ - "subs x10, x10, x3\n\t" - "sbcs x11, x11, %x[a]\n\t" - "sbcs x12, x12, %x[a]\n\t" - "sbc x13, x13, x4\n\t" + "adds x10, x10, x3\n\t" + "adcs x11, x11, xzr\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "adcs x12, x12, xzr\n\t" + "adc x13, x13, xzr\n\t" /* Sub */ - "subs x19, x6, x19\n\t" - "sbcs x20, x7, x20\n\t" - "sbcs x21, x8, x21\n\t" - "sbcs x22, x9, x22\n\t" + "subs x19, x25, x19\n\t" + "sbcs x20, x26, x20\n\t" + "sbcs x21, x27, x21\n\t" + "sbcs x22, x28, x22\n\t" + "csetm x5, cc\n\t" "mov x3, #-19\n\t" - "csetm %x[a], cc\n\t" - /* Mask the modulus */ - "and x3, %x[a], x3\n\t" - "and x4, %x[a], #0x7fffffffffffffff\n\t" + "extr x5, x5, x22, #63\n\t" + "mul x3, x5, x3\n\t" /* Add modulus (if underflow) */ - "adds x19, x19, x3\n\t" - "adcs x20, x20, %x[a]\n\t" - "adcs x21, x21, %x[a]\n\t" - "adc x22, x22, x4\n\t" + "subs x19, x19, x3\n\t" + "sbcs x20, x20, xzr\n\t" + "and x22, x22, #0x7fffffffffffffff\n\t" + "sbcs x21, x21, xzr\n\t" + "sbc x22, x22, xzr\n\t" /* Square */ /* A[0] * A[1] */ - "mul x7, x10, x11\n\t" "umulh x8, x10, x11\n\t" + "mul x7, x10, x11\n\t" + /* A[0] * A[3] */ + "umulh x25, x10, x13\n\t" + "mul x9, x10, x13\n\t" /* A[0] * A[2] */ "mul x3, x10, x12\n\t" - "umulh x9, x10, x12\n\t" "adds x8, x8, x3\n\t" - "adc x9, x9, xzr\n\t" - /* A[0] * A[3] */ - "mul x3, x10, x13\n\t" - "umulh %x[a], x10, x13\n\t" - "adds x9, x9, x3\n\t" - "adc %x[a], %x[a], xzr\n\t" + "umulh x4, x10, x12\n\t" + "adcs x9, x9, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x11, x13\n\t" + "adcs x25, x25, x3\n\t" + "umulh x26, x11, x13\n\t" + "adc x26, x26, xzr\n\t" /* A[1] * A[2] */ "mul x3, x11, x12\n\t" - "umulh x4, x11, x12\n\t" "adds x9, x9, x3\n\t" - "adcs %x[a], %x[a], x4\n\t" - "adc x26, xzr, xzr\n\t" - /* A[1] * A[3] */ - "mul x3, x11, x13\n\t" - "umulh x4, x11, x13\n\t" - "adds %x[a], %x[a], x3\n\t" - "adc x26, x26, x4\n\t" + "umulh x4, x11, x12\n\t" + "adcs x25, x25, x4\n\t" /* A[2] * A[3] */ "mul x3, x12, x13\n\t" + "adcs x26, x26, x3\n\t" "umulh x27, x12, x13\n\t" - "adds x26, x26, x3\n\t" "adc x27, x27, xzr\n\t" /* Double */ "adds x7, x7, x7\n\t" "adcs x8, x8, x8\n\t" "adcs x9, x9, x9\n\t" - "adcs %x[a], %x[a], %x[a]\n\t" + "adcs x25, x25, x25\n\t" "adcs x26, x26, x26\n\t" "adcs x27, x27, x27\n\t" "adc x28, xzr, xzr\n\t" /* A[0] * A[0] */ + "umulh x4, x10, x10\n\t" "mul x6, x10, x10\n\t" - "umulh x5, x10, x10\n\t" /* A[1] * A[1] */ "mul x3, x11, x11\n\t" + "adds x7, x7, x4\n\t" "umulh x4, x11, x11\n\t" - "adds x7, x7, x5\n\t" "adcs x8, x8, x3\n\t" - "adc x5, x4, xzr\n\t" /* A[2] * A[2] */ "mul x3, x12, x12\n\t" + "adcs x9, x9, x4\n\t" "umulh x4, x12, x12\n\t" - "adds x9, x9, x5\n\t" - "adcs %x[a], %x[a], x3\n\t" - "adc x5, x4, xzr\n\t" + "adcs x25, x25, x3\n\t" /* A[3] * A[3] */ "mul x3, x13, x13\n\t" + "adcs x26, x26, x4\n\t" "umulh x4, x13, x13\n\t" - "adds x26, x26, x5\n\t" "adcs x27, x27, x3\n\t" "adc x28, x28, x4\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x28, x28, x27, #63\n\t" - "extr x27, x27, 
x26, #63\n\t" - "extr x26, x26, %x[a], #63\n\t" - "extr %x[a], %x[a], x9, #63\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x3, #19\n\t" - "mul x4, x3, %x[a]\n\t" - "umulh %x[a], x3, %x[a]\n\t" - "adds x6, x6, x4\n\t" - "mul x4, x3, x26\n\t" - "umulh x26, x3, x26\n\t" - "adcs x7, x7, x4\n\t" - "mul x4, x3, x27\n\t" - "umulh x27, x3, x27\n\t" - "adcs x8, x8, x4\n\t" + "mov x3, #38\n\t" "mul x4, x3, x28\n\t" + "adds x9, x9, x4\n\t" "umulh x5, x3, x28\n\t" - "adcs x9, x9, x4\n\t" "adc x5, x5, xzr\n\t" - /* Add remaining product results in */ - "adds x7, x7, %x[a]\n\t" - "adcs x8, x8, x26\n\t" - "adcs x9, x9, x27\n\t" - "adc x5, x5, xzr\n\t" - /* Overflow */ + "mov x3, #19\n\t" "extr x5, x5, x9, #63\n\t" "mul x5, x5, x3\n\t" "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x25\n\t" + "adds x6, x6, x4\n\t" + "umulh x25, x3, x25\n\t" + "mul x4, x3, x26\n\t" + "adcs x7, x7, x4\n\t" + "umulh x26, x3, x26\n\t" + "mul x4, x3, x27\n\t" + "adcs x8, x8, x4\n\t" + "umulh x27, x3, x27\n\t" "adc x9, x9, xzr\n\t" - /* Reduce if top bit set */ - "and x5, x3, x9, asr 63\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" + /* Add high product results in */ "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" - /* Store */ - "stp x6, x7, [x29, #80]\n\t" - "stp x8, x9, [x29, #96]\n\t" + "adcs x7, x7, x25\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, x27\n\t" /* Square */ /* A[0] * A[1] */ - "mul x7, x19, x20\n\t" - "umulh x8, x19, x20\n\t" + "umulh x16, x19, x20\n\t" + "mul x15, x19, x20\n\t" + /* A[0] * A[3] */ + "umulh x25, x19, x22\n\t" + "mul x17, x19, x22\n\t" /* A[0] * A[2] */ "mul x3, x19, x21\n\t" - "umulh x9, x19, x21\n\t" - "adds x8, x8, x3\n\t" - "adc x9, x9, xzr\n\t" - /* A[0] * A[3] */ - "mul x3, x19, x22\n\t" - "umulh %x[a], x19, x22\n\t" - "adds x9, x9, x3\n\t" - "adc %x[a], %x[a], xzr\n\t" + "adds x16, x16, x3\n\t" + "umulh x4, x19, x21\n\t" + "adcs x17, x17, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x20, x22\n\t" + "adcs x25, x25, x3\n\t" + "umulh x26, x20, x22\n\t" + "adc x26, x26, xzr\n\t" /* A[1] * A[2] */ "mul x3, x20, x21\n\t" + "adds x17, x17, x3\n\t" "umulh x4, x20, x21\n\t" - "adds x9, x9, x3\n\t" - "adcs %x[a], %x[a], x4\n\t" - "adc x26, xzr, xzr\n\t" - /* A[1] * A[3] */ - "mul x3, x20, x22\n\t" - "umulh x4, x20, x22\n\t" - "adds %x[a], %x[a], x3\n\t" - "adc x26, x26, x4\n\t" + "adcs x25, x25, x4\n\t" /* A[2] * A[3] */ "mul x3, x21, x22\n\t" + "adcs x26, x26, x3\n\t" "umulh x27, x21, x22\n\t" - "adds x26, x26, x3\n\t" "adc x27, x27, xzr\n\t" /* Double */ - "adds x7, x7, x7\n\t" - "adcs x8, x8, x8\n\t" - "adcs x9, x9, x9\n\t" - "adcs %x[a], %x[a], %x[a]\n\t" + "adds x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adcs x17, x17, x17\n\t" + "adcs x25, x25, x25\n\t" "adcs x26, x26, x26\n\t" "adcs x27, x27, x27\n\t" "adc x28, xzr, xzr\n\t" /* A[0] * A[0] */ - "mul x6, x19, x19\n\t" - "umulh x5, x19, x19\n\t" + "umulh x4, x19, x19\n\t" + "mul x14, x19, x19\n\t" /* A[1] * A[1] */ "mul x3, x20, x20\n\t" + "adds x15, x15, x4\n\t" "umulh x4, x20, x20\n\t" - "adds x7, x7, x5\n\t" - "adcs x8, x8, x3\n\t" - "adc x5, x4, xzr\n\t" + "adcs x16, x16, x3\n\t" /* A[2] * A[2] */ "mul x3, x21, x21\n\t" + "adcs x17, x17, x4\n\t" "umulh x4, x21, x21\n\t" - "adds x9, x9, x5\n\t" - "adcs %x[a], %x[a], x3\n\t" - "adc x5, x4, xzr\n\t" + "adcs x25, x25, x3\n\t" /* A[3] * A[3] */ "mul x3, x22, x22\n\t" + "adcs x26, x26, x4\n\t" "umulh x4, x22, 
x22\n\t" - "adds x26, x26, x5\n\t" "adcs x27, x27, x3\n\t" "adc x28, x28, x4\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x28, x28, x27, #63\n\t" - "extr x27, x27, x26, #63\n\t" - "extr x26, x26, %x[a], #63\n\t" - "extr %x[a], %x[a], x9, #63\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ + "mov x3, #38\n\t" + "mul x4, x3, x28\n\t" + "adds x17, x17, x4\n\t" + "umulh x5, x3, x28\n\t" + "adc x5, x5, xzr\n\t" "mov x3, #19\n\t" - "mul x4, x3, %x[a]\n\t" - "umulh %x[a], x3, %x[a]\n\t" - "adds x6, x6, x4\n\t" + "extr x5, x5, x17, #63\n\t" + "mul x5, x5, x3\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x25\n\t" + "adds x14, x14, x4\n\t" + "umulh x25, x3, x25\n\t" "mul x4, x3, x26\n\t" + "adcs x15, x15, x4\n\t" "umulh x26, x3, x26\n\t" - "adcs x7, x7, x4\n\t" "mul x4, x3, x27\n\t" + "adcs x16, x16, x4\n\t" "umulh x27, x3, x27\n\t" - "adcs x8, x8, x4\n\t" - "mul x4, x3, x28\n\t" - "umulh x5, x3, x28\n\t" - "adcs x9, x9, x4\n\t" - "adc x5, x5, xzr\n\t" - /* Add remaining product results in */ - "adds x7, x7, %x[a]\n\t" - "adcs x8, x8, x26\n\t" - "adcs x9, x9, x27\n\t" - "adc x5, x5, xzr\n\t" - /* Overflow */ - "extr x5, x5, x9, #63\n\t" - "mul x5, x5, x3\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" - /* Reduce if top bit set */ - "and x5, x3, x9, asr 63\n\t" - "and x9, x9, #0x7fffffffffffffff\n\t" - "adds x6, x6, x5\n\t" - "adcs x7, x7, xzr\n\t" - "adcs x8, x8, xzr\n\t" - "adc x9, x9, xzr\n\t" - /* Store */ - "ldr %x[a], [x29, #184]\n\t" + "adc x17, x17, xzr\n\t" + /* Add high product results in */ + "adds x14, x14, x5\n\t" + "adcs x15, x15, x25\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, x27\n\t" /* Multiply */ - "ldp x14, x15, [%x[a]]\n\t" - "ldp x16, x17, [%x[a], #16]\n\t" - /* A[0] * B[0] */ - "mul x10, x14, x6\n\t" - "umulh x11, x14, x6\n\t" - /* A[0] * B[1] */ - "mul x3, x14, x7\n\t" - "umulh x12, x14, x7\n\t" + "ldp x19, x20, [%x[a]]\n\t" + "ldp x21, x22, [%x[a], #16]\n\t" + /* A[0] * B[0] */ + "umulh x11, x19, x14\n\t" + "mul x10, x19, x14\n\t" + /* A[2] * B[0] */ + "umulh x13, x21, x14\n\t" + "mul x12, x21, x14\n\t" + /* A[1] * B[0] */ + "mul x3, x20, x14\n\t" "adds x11, x11, x3\n\t" - "adc x12, x12, xzr\n\t" - /* A[1] * B[0] */ - "mul x3, x15, x6\n\t" - "umulh x4, x15, x6\n\t" + "umulh x4, x20, x14\n\t" + "adcs x12, x12, x4\n\t" + "adc x13, x13, xzr\n\t" + /* A[1] * B[3] */ + "umulh x26, x20, x17\n\t" + "mul x25, x20, x17\n\t" + /* A[0] * B[1] */ + "mul x3, x19, x15\n\t" "adds x11, x11, x3\n\t" + "umulh x4, x19, x15\n\t" "adcs x12, x12, x4\n\t" - "adc x13, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x3, x14, x8\n\t" - "umulh x4, x14, x8\n\t" - "adds x12, x12, x3\n\t" - "adc x13, x13, x4\n\t" - /* A[1] * B[1] */ - "mul x3, x15, x7\n\t" - "umulh x4, x15, x7\n\t" + /* A[2] * B[1] */ + "mul x3, x21, x15\n\t" + "adcs x13, x13, x3\n\t" + "umulh x4, x21, x15\n\t" + "adcs x25, x25, x4\n\t" + "adc x26, x26, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x20, x16\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x20, x16\n\t" + "adcs x25, x25, x4\n\t" + "adcs x26, x26, xzr\n\t" + "adc x27, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x19, x16\n\t" "adds x12, x12, x3\n\t" + "umulh x4, x19, x16\n\t" "adcs x13, x13, x4\n\t" - "adc %x[a], xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x3, x16, x6\n\t" - "umulh x4, x16, x6\n\t" + "adcs x25, x25, xzr\n\t" + "adcs x26, x26, xzr\n\t" + "adc x27, x27, xzr\n\t" + /* A[1] * 
B[1] */ + "mul x3, x20, x15\n\t" "adds x12, x12, x3\n\t" + "umulh x4, x20, x15\n\t" "adcs x13, x13, x4\n\t" - "adc %x[a], %x[a], xzr\n\t" - /* A[0] * B[3] */ - "mul x3, x14, x9\n\t" - "umulh x4, x14, x9\n\t" - "adds x13, x13, x3\n\t" - "adcs %x[a], %x[a], x4\n\t" - "adc x26, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x3, x15, x8\n\t" - "umulh x4, x15, x8\n\t" - "adds x13, x13, x3\n\t" - "adcs %x[a], %x[a], x4\n\t" - "adc x26, x26, xzr\n\t" - /* A[2] * B[1] */ - "mul x3, x16, x7\n\t" - "umulh x4, x16, x7\n\t" - "adds x13, x13, x3\n\t" - "adcs %x[a], %x[a], x4\n\t" - "adc x26, x26, xzr\n\t" - /* A[3] * B[0] */ - "mul x3, x17, x6\n\t" - "umulh x4, x17, x6\n\t" - "adds x13, x13, x3\n\t" - "adcs %x[a], %x[a], x4\n\t" - "adc x26, x26, xzr\n\t" - /* A[1] * B[3] */ - "mul x3, x15, x9\n\t" - "umulh x4, x15, x9\n\t" - "adds %x[a], %x[a], x3\n\t" - "adcs x26, x26, x4\n\t" - "adc x27, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x3, x16, x8\n\t" - "umulh x4, x16, x8\n\t" - "adds %x[a], %x[a], x3\n\t" + /* A[3] * B[1] */ + "mul x3, x22, x15\n\t" + "adcs x25, x25, x3\n\t" + "umulh x4, x22, x15\n\t" "adcs x26, x26, x4\n\t" "adc x27, x27, xzr\n\t" - /* A[3] * B[1] */ - "mul x3, x17, x7\n\t" - "umulh x4, x17, x7\n\t" - "adds %x[a], %x[a], x3\n\t" + /* A[2] * B[2] */ + "mul x3, x21, x16\n\t" + "adds x25, x25, x3\n\t" + "umulh x4, x21, x16\n\t" "adcs x26, x26, x4\n\t" - "adc x27, x27, xzr\n\t" - /* A[2] * B[3] */ - "mul x3, x16, x9\n\t" - "umulh x4, x16, x9\n\t" - "adds x26, x26, x3\n\t" + /* A[3] * B[3] */ + "mul x3, x22, x17\n\t" + "adcs x27, x27, x3\n\t" + "umulh x28, x22, x17\n\t" + "adc x28, x28, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x19, x17\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x19, x17\n\t" + "adcs x25, x25, x4\n\t" + /* A[2] * B[3] */ + "mul x3, x21, x17\n\t" + "adcs x26, x26, x3\n\t" + "umulh x4, x21, x17\n\t" "adcs x27, x27, x4\n\t" - "adc x28, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x3, x17, x8\n\t" - "umulh x4, x17, x8\n\t" - "adds x26, x26, x3\n\t" + "adc x28, x28, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x22, x14\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x22, x14\n\t" + "adcs x25, x25, x4\n\t" + /* A[3] * B[2] */ + "mul x3, x22, x16\n\t" + "adcs x26, x26, x3\n\t" + "umulh x4, x22, x16\n\t" "adcs x27, x27, x4\n\t" "adc x28, x28, xzr\n\t" - /* A[3] * B[3] */ - "mul x3, x17, x9\n\t" - "umulh x4, x17, x9\n\t" - "adds x27, x27, x3\n\t" - "adc x28, x28, x4\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x28, x28, x27, #63\n\t" - "extr x27, x27, x26, #63\n\t" - "extr x26, x26, %x[a], #63\n\t" - "extr %x[a], %x[a], x13, #63\n\t" - "and x13, x13, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x3, #19\n\t" - "mul x4, x3, %x[a]\n\t" - "umulh %x[a], x3, %x[a]\n\t" - "adds x10, x10, x4\n\t" - "mul x4, x3, x26\n\t" - "umulh x26, x3, x26\n\t" - "adcs x11, x11, x4\n\t" - "mul x4, x3, x27\n\t" - "umulh x27, x3, x27\n\t" - "adcs x12, x12, x4\n\t" + "mov x3, #38\n\t" "mul x4, x3, x28\n\t" + "adds x13, x13, x4\n\t" "umulh x5, x3, x28\n\t" - "adcs x13, x13, x4\n\t" "adc x5, x5, xzr\n\t" - /* Add remaining product results in */ - "adds x11, x11, %x[a]\n\t" - "adcs x12, x12, x26\n\t" - "adcs x13, x13, x27\n\t" - "adc x5, x5, xzr\n\t" - /* Overflow */ + "mov x3, #19\n\t" "extr x5, x5, x13, #63\n\t" "mul x5, x5, x3\n\t" "and x13, x13, #0x7fffffffffffffff\n\t" - "adds x10, x10, x5\n\t" - "adcs x11, x11, xzr\n\t" - "adcs x12, x12, xzr\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x25\n\t" + "adds x10, x10, x4\n\t" + "umulh x25, x3, x25\n\t" + "mul x4, x3, x26\n\t" + 
"adcs x11, x11, x4\n\t" + "umulh x26, x3, x26\n\t" + "mul x4, x3, x27\n\t" + "adcs x12, x12, x4\n\t" + "umulh x27, x3, x27\n\t" "adc x13, x13, xzr\n\t" - /* Reduce if top bit set */ - "and x5, x3, x13, asr 63\n\t" - "and x13, x13, #0x7fffffffffffffff\n\t" + /* Add high product results in */ "adds x10, x10, x5\n\t" - "adcs x11, x11, xzr\n\t" - "adcs x12, x12, xzr\n\t" - "adc x13, x13, xzr\n\t" - /* Store */ - "stp x10, x11, [x29, #48]\n\t" - "stp x12, x13, [x29, #64]\n\t" - "sub x25, x25, #1\n\t" - "cmp x25, #0\n\t" + "adcs x11, x11, x25\n\t" + "adcs x12, x12, x26\n\t" + "adc x13, x13, x27\n\t" + "subs x24, x24, #1\n\t" "bge L_curve25519_bits_%=\n\t" - "mov x25, #63\n\t" - "sub x24, x24, #8\n\t" - "cmp x24, #0\n\t" - "bge L_curve25519_words_%=\n\t" /* Invert */ "add x0, x29, #48\n\t" "add x1, x29, #16\n\t" @@ -2407,379 +2736,930 @@ int curve25519(byte* r, const byte* n, const byte* a) #else "bl _fe_mul\n\t" #endif /* __APPLE__ */ - "add x0, x29, #0x70\n\t" -#ifndef NDEBUG - "add x1, x29, #0x50\n\t" -#endif /* !NDEBUG */ -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ - "mov x24, #3\n\t" -#ifndef NDEBUG - "add x0, x29, #0x70\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #0x70\n\t" + /* Loop: 5 times */ + "mov x24, #5\n\t" + "ldp x6, x7, [x29, #80]\n\t" + "ldp x8, x9, [x29, #96]\n\t" "\n" "L_curve25519_inv_1_%=: \n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" "subs x24, x24, #1\n\t" - "bcs L_curve25519_inv_1_%=\n\t" - "add x0, x29, #0x50\n\t" + "bne L_curve25519_inv_1_%=\n\t" + /* Store */ + "stp 
x6, x7, [x29, #112]\n\t" + "stp x8, x9, [x29, #128]\n\t" #ifndef NDEBUG - "add x1, x29, #0x70\n\t" + "add x0, x29, #0x50\n\t" #endif /* !NDEBUG */ + "add x1, x29, #0x70\n\t" "add x2, x29, #0x50\n\t" #ifndef __APPLE__ "bl fe_mul\n\t" #else "bl _fe_mul\n\t" #endif /* __APPLE__ */ - "add x0, x29, #0x70\n\t" - "add x1, x29, #0x50\n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ - "mov x24, #8\n\t" -#ifndef NDEBUG - "add x0, x29, #0x70\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #0x70\n\t" + /* Loop: 10 times */ + "mov x24, #10\n\t" + "ldp x6, x7, [x29, #80]\n\t" + "ldp x8, x9, [x29, #96]\n\t" "\n" "L_curve25519_inv_2_%=: \n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ - "subs x24, x24, #1\n\t" - "bcs L_curve25519_inv_2_%=\n\t" -#ifndef NDEBUG - "add x0, x29, #0x70\n\t" -#endif /* !NDEBUG */ -#ifndef NDEBUG - "add x1, x29, #0x70\n\t" -#endif /* !NDEBUG */ - "add x2, x29, #0x50\n\t" -#ifndef __APPLE__ - "bl fe_mul\n\t" -#else - "bl _fe_mul\n\t" -#endif /* __APPLE__ */ - "add x0, x29, #0x90\n\t" -#ifndef NDEBUG - "add x1, x29, #0x70\n\t" -#endif /* !NDEBUG */ -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ - "mov x24, #18\n\t" -#ifndef NDEBUG - "add x0, x29, #0x90\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #0x90\n\t" - "\n" - "L_curve25519_inv_3_%=: \n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" "subs x24, x24, #1\n\t" - "bcs L_curve25519_inv_3_%=\n\t" + "bne L_curve25519_inv_2_%=\n\t" + /* Store */ + "stp x6, x7, 
[x29, #112]\n\t" + "stp x8, x9, [x29, #128]\n\t" "add x0, x29, #0x70\n\t" #ifndef NDEBUG - "add x1, x29, #0x90\n\t" + "add x1, x29, #0x70\n\t" #endif /* !NDEBUG */ - "add x2, x29, #0x70\n\t" + "add x2, x29, #0x50\n\t" #ifndef __APPLE__ "bl fe_mul\n\t" #else "bl _fe_mul\n\t" #endif /* __APPLE__ */ - "mov x24, #9\n\t" -#ifndef NDEBUG - "add x0, x29, #0x70\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #0x70\n\t" + /* Loop: 20 times */ + "mov x24, #20\n\t" + "ldp x6, x7, [x29, #112]\n\t" + "ldp x8, x9, [x29, #128]\n\t" "\n" - "L_curve25519_inv_4_%=: \n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ + "L_curve25519_inv_3_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" "subs x24, x24, #1\n\t" - "bcs L_curve25519_inv_4_%=\n\t" - "add x0, x29, #0x50\n\t" + "bne L_curve25519_inv_3_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #144]\n\t" + "stp x8, x9, [x29, #160]\n\t" #ifndef NDEBUG - "add x1, x29, #0x70\n\t" + "add x0, x29, #0x70\n\t" #endif /* !NDEBUG */ - "add x2, x29, #0x50\n\t" + "add x1, x29, #0x90\n\t" + "add x2, x29, #0x70\n\t" #ifndef __APPLE__ "bl fe_mul\n\t" #else "bl _fe_mul\n\t" #endif /* __APPLE__ */ - "add x0, x29, #0x70\n\t" - "add x1, x29, #0x50\n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ - "mov x24, #48\n\t" -#ifndef NDEBUG - "add x0, x29, #0x70\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #0x70\n\t" + /* Loop: 10 times */ + "mov x24, #10\n\t" + "ldp x6, x7, [x29, #112]\n\t" + "ldp x8, x9, [x29, #128]\n\t" "\n" - "L_curve25519_inv_5_%=: \n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* 
__APPLE__ */ + "L_curve25519_inv_4_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" "subs x24, x24, #1\n\t" - "bcs L_curve25519_inv_5_%=\n\t" -#ifndef NDEBUG - "add x0, x29, #0x70\n\t" -#endif /* !NDEBUG */ -#ifndef NDEBUG + "bne L_curve25519_inv_4_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #112]\n\t" + "stp x8, x9, [x29, #128]\n\t" + "add x0, x29, #0x50\n\t" "add x1, x29, #0x70\n\t" -#endif /* !NDEBUG */ "add x2, x29, #0x50\n\t" #ifndef __APPLE__ "bl fe_mul\n\t" #else "bl _fe_mul\n\t" #endif /* __APPLE__ */ - "add x0, x29, #0x90\n\t" -#ifndef NDEBUG - "add x1, x29, #0x70\n\t" -#endif /* !NDEBUG */ -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ - "mov x24, #0x62\n\t" -#ifndef NDEBUG - "add x0, x29, #0x90\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #0x90\n\t" + /* Loop: 50 times */ + "mov x24, #50\n\t" + "ldp x6, x7, [x29, #80]\n\t" + "ldp x8, x9, [x29, #96]\n\t" "\n" - "L_curve25519_inv_6_%=: \n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ + "L_curve25519_inv_5_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs 
x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" "subs x24, x24, #1\n\t" - "bcs L_curve25519_inv_6_%=\n\t" + "bne L_curve25519_inv_5_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #112]\n\t" + "stp x8, x9, [x29, #128]\n\t" "add x0, x29, #0x70\n\t" #ifndef NDEBUG - "add x1, x29, #0x90\n\t" + "add x1, x29, #0x70\n\t" #endif /* !NDEBUG */ - "add x2, x29, #0x70\n\t" + "add x2, x29, #0x50\n\t" #ifndef __APPLE__ "bl fe_mul\n\t" #else "bl _fe_mul\n\t" #endif /* __APPLE__ */ - "mov x24, #49\n\t" + /* Loop: 100 times */ + "mov x24, #0x64\n\t" + "ldp x6, x7, [x29, #112]\n\t" + "ldp x8, x9, [x29, #128]\n\t" + "\n" + "L_curve25519_inv_6_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul 
x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" + "subs x24, x24, #1\n\t" + "bne L_curve25519_inv_6_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #144]\n\t" + "stp x8, x9, [x29, #160]\n\t" #ifndef NDEBUG "add x0, x29, #0x70\n\t" #endif /* !NDEBUG */ - "add x1, x29, #0x70\n\t" - "\n" - "L_curve25519_inv_7_%=: \n\t" + "add x1, x29, #0x90\n\t" + "add x2, x29, #0x70\n\t" #ifndef __APPLE__ - "bl fe_sq\n\t" + "bl fe_mul\n\t" #else - "bl _fe_sq\n\t" + "bl _fe_mul\n\t" #endif /* __APPLE__ */ + /* Loop: 50 times */ + "mov x24, #50\n\t" + "ldp x6, x7, [x29, #112]\n\t" + "ldp x8, x9, [x29, #128]\n\t" + "\n" + "L_curve25519_inv_7_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" "subs x24, x24, #1\n\t" - "bcs L_curve25519_inv_7_%=\n\t" + "bne L_curve25519_inv_7_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #112]\n\t" + "stp x8, x9, [x29, #128]\n\t" "add x0, x29, #0x50\n\t" -#ifndef NDEBUG "add x1, x29, #0x70\n\t" -#endif /* !NDEBUG */ "add x2, x29, #0x50\n\t" #ifndef __APPLE__ "bl fe_mul\n\t" #else "bl _fe_mul\n\t" #endif /* __APPLE__ */ - "mov x24, #4\n\t" -#ifndef NDEBUG - "add x0, x29, #0x50\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #0x50\n\t" + /* Loop: 5 times */ + "mov x24, #5\n\t" + "ldp x6, x7, [x29, #80]\n\t" + "ldp x8, x9, [x29, #96]\n\t" "\n" "L_curve25519_inv_8_%=: \n\t" -#ifndef 
__APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ - "subs x24, x24, #1\n\t" - "bcs L_curve25519_inv_8_%=\n\t" - "add x0, x29, #16\n\t" -#ifndef NDEBUG - "add x1, x29, #0x50\n\t" -#endif /* !NDEBUG */ - "add x2, x29, #48\n\t" -#ifndef __APPLE__ - "bl fe_mul\n\t" -#else - "bl _fe_mul\n\t" -#endif /* __APPLE__ */ - "ldr %x[r], [x29, #176]\n\t" - /* Multiply */ - "ldp x6, x7, [%x[r]]\n\t" - "ldp x8, x9, [%x[r], #16]\n\t" - "ldp x10, x11, [x29, #16]\n\t" - "ldp x12, x13, [x29, #32]\n\t" - /* A[0] * B[0] */ - "mul x14, x6, x10\n\t" - "umulh x15, x6, x10\n\t" - /* A[0] * B[1] */ - "mul x3, x6, x11\n\t" - "umulh x16, x6, x11\n\t" - "adds x15, x15, x3\n\t" - "adc x16, x16, xzr\n\t" - /* A[1] * B[0] */ - "mul x3, x7, x10\n\t" - "umulh x4, x7, x10\n\t" - "adds x15, x15, x3\n\t" - "adcs x16, x16, x4\n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" "adc x17, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x3, x6, x12\n\t" - "umulh x4, x6, x12\n\t" - "adds x16, x16, x3\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" "adc x17, x17, x4\n\t" - /* A[1] * B[1] */ - "mul x3, x7, x11\n\t" - "umulh x4, x7, x11\n\t" - "adds x16, x16, x3\n\t" - "adcs x17, x17, x4\n\t" - "adc x19, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x3, x8, x10\n\t" - "umulh x4, x8, x10\n\t" - "adds x16, x16, x3\n\t" - "adcs x17, x17, x4\n\t" - "adc x19, x19, xzr\n\t" - /* A[0] * B[3] */ - "mul x3, x6, x13\n\t" - "umulh x4, x6, x13\n\t" - "adds x17, x17, x3\n\t" - "adcs x19, x19, x4\n\t" - "adc x20, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x3, x7, x12\n\t" - "umulh x4, x7, x12\n\t" - "adds x17, x17, x3\n\t" - "adcs x19, x19, x4\n\t" - "adc x20, x20, xzr\n\t" - /* A[2] * B[1] */ + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" + "subs x24, x24, #1\n\t" + "bne L_curve25519_inv_8_%=\n\t" + 
/* Store */ + "stp x6, x7, [x29, #80]\n\t" + "stp x8, x9, [x29, #96]\n\t" + "add x0, x29, #16\n\t" + "add x1, x29, #0x50\n\t" + "add x2, x29, #48\n\t" +#ifndef __APPLE__ + "bl fe_mul\n\t" +#else + "bl _fe_mul\n\t" +#endif /* __APPLE__ */ + "ldr %x[r], [x29, #176]\n\t" + /* Multiply */ + "ldp x6, x7, [%x[r]]\n\t" + "ldp x8, x9, [%x[r], #16]\n\t" + "ldp x10, x11, [x29, #16]\n\t" + "ldp x12, x13, [x29, #32]\n\t" + /* A[0] * B[0] */ + "umulh x15, x6, x10\n\t" + "mul x14, x6, x10\n\t" + /* A[2] * B[0] */ + "umulh x17, x8, x10\n\t" + "mul x16, x8, x10\n\t" + /* A[1] * B[0] */ + "mul x3, x7, x10\n\t" + "adds x15, x15, x3\n\t" + "umulh x4, x7, x10\n\t" + "adcs x16, x16, x4\n\t" + "adc x17, x17, xzr\n\t" + /* A[1] * B[3] */ + "umulh x20, x7, x13\n\t" + "mul x19, x7, x13\n\t" + /* A[0] * B[1] */ + "mul x3, x6, x11\n\t" + "adds x15, x15, x3\n\t" + "umulh x4, x6, x11\n\t" + "adcs x16, x16, x4\n\t" + /* A[2] * B[1] */ "mul x3, x8, x11\n\t" + "adcs x17, x17, x3\n\t" "umulh x4, x8, x11\n\t" - "adds x17, x17, x3\n\t" "adcs x19, x19, x4\n\t" "adc x20, x20, xzr\n\t" - /* A[3] * B[0] */ - "mul x3, x9, x10\n\t" - "umulh x4, x9, x10\n\t" + /* A[1] * B[2] */ + "mul x3, x7, x12\n\t" "adds x17, x17, x3\n\t" + "umulh x4, x7, x12\n\t" "adcs x19, x19, x4\n\t" - "adc x20, x20, xzr\n\t" - /* A[1] * B[3] */ - "mul x3, x7, x13\n\t" - "umulh x4, x7, x13\n\t" - "adds x19, x19, x3\n\t" - "adcs x20, x20, x4\n\t" + "adcs x20, x20, xzr\n\t" "adc x21, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x3, x8, x12\n\t" - "umulh x4, x8, x12\n\t" - "adds x19, x19, x3\n\t" - "adcs x20, x20, x4\n\t" + /* A[0] * B[2] */ + "mul x3, x6, x12\n\t" + "adds x16, x16, x3\n\t" + "umulh x4, x6, x12\n\t" + "adcs x17, x17, x4\n\t" + "adcs x19, x19, xzr\n\t" + "adcs x20, x20, xzr\n\t" "adc x21, x21, xzr\n\t" - /* A[3] * B[1] */ + /* A[1] * B[1] */ + "mul x3, x7, x11\n\t" + "adds x16, x16, x3\n\t" + "umulh x4, x7, x11\n\t" + "adcs x17, x17, x4\n\t" + /* A[3] * B[1] */ "mul x3, x9, x11\n\t" + "adcs x19, x19, x3\n\t" "umulh x4, x9, x11\n\t" - "adds x19, x19, x3\n\t" "adcs x20, x20, x4\n\t" "adc x21, x21, xzr\n\t" - /* A[2] * B[3] */ + /* A[2] * B[2] */ + "mul x3, x8, x12\n\t" + "adds x19, x19, x3\n\t" + "umulh x4, x8, x12\n\t" + "adcs x20, x20, x4\n\t" + /* A[3] * B[3] */ + "mul x3, x9, x13\n\t" + "adcs x21, x21, x3\n\t" + "umulh x22, x9, x13\n\t" + "adc x22, x22, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x6, x13\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x6, x13\n\t" + "adcs x19, x19, x4\n\t" + /* A[2] * B[3] */ "mul x3, x8, x13\n\t" + "adcs x20, x20, x3\n\t" "umulh x4, x8, x13\n\t" - "adds x20, x20, x3\n\t" "adcs x21, x21, x4\n\t" - "adc x22, xzr, xzr\n\t" - /* A[3] * B[2] */ + "adc x22, x22, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x9, x10\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x9, x10\n\t" + "adcs x19, x19, x4\n\t" + /* A[3] * B[2] */ "mul x3, x9, x12\n\t" + "adcs x20, x20, x3\n\t" "umulh x4, x9, x12\n\t" - "adds x20, x20, x3\n\t" "adcs x21, x21, x4\n\t" "adc x22, x22, xzr\n\t" - /* A[3] * B[3] */ - "mul x3, x9, x13\n\t" - "umulh x4, x9, x13\n\t" - "adds x21, x21, x3\n\t" - "adc x22, x22, x4\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x22, x22, x21, #63\n\t" - "extr x21, x21, x20, #63\n\t" - "extr x20, x20, x19, #63\n\t" - "extr x19, x19, x17, #63\n\t" - "and x17, x17, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ + "mov x3, #38\n\t" + "mul x4, x3, x22\n\t" + "adds x17, x17, x4\n\t" + "umulh x5, x3, x22\n\t" + "adc x5, x5, xzr\n\t" "mov x3, #19\n\t" + "extr x5, x5, x17, #63\n\t" + "mul x5, x5, 
x3\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" "mul x4, x3, x19\n\t" - "umulh x19, x3, x19\n\t" "adds x14, x14, x4\n\t" + "umulh x19, x3, x19\n\t" "mul x4, x3, x20\n\t" - "umulh x20, x3, x20\n\t" "adcs x15, x15, x4\n\t" + "umulh x20, x3, x20\n\t" "mul x4, x3, x21\n\t" - "umulh x21, x3, x21\n\t" "adcs x16, x16, x4\n\t" - "mul x4, x3, x22\n\t" - "umulh x5, x3, x22\n\t" - "adcs x17, x17, x4\n\t" - "adc x5, x5, xzr\n\t" - /* Add remaining product results in */ - "adds x15, x15, x19\n\t" - "adcs x16, x16, x20\n\t" - "adcs x17, x17, x21\n\t" - "adc x5, x5, xzr\n\t" - /* Overflow */ - "extr x5, x5, x17, #63\n\t" - "mul x5, x5, x3\n\t" - "and x17, x17, #0x7fffffffffffffff\n\t" - "adds x14, x14, x5\n\t" - "adcs x15, x15, xzr\n\t" - "adcs x16, x16, xzr\n\t" + "umulh x21, x3, x21\n\t" "adc x17, x17, xzr\n\t" - /* Reduce if top bit set */ - "and x5, x3, x17, asr 63\n\t" - "and x17, x17, #0x7fffffffffffffff\n\t" + /* Add high product results in */ "adds x14, x14, x5\n\t" + "adcs x15, x15, x19\n\t" + "adcs x16, x16, x20\n\t" + "adc x17, x17, x21\n\t" + /* Reduce if top bit set */ + "mov x3, #19\n\t" + "and x4, x3, x17, asr 63\n\t" + "adds x14, x14, x4\n\t" "adcs x15, x15, xzr\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" "adcs x16, x16, xzr\n\t" "adc x17, x17, xzr\n\t" "adds x4, x14, x3\n\t" @@ -2805,6 +3685,7 @@ int curve25519(byte* r, const byte* n, const byte* a) return (uint32_t)(size_t)r; } +#ifdef HAVE_ED25519 void fe_pow22523(fe r, const fe a) { __asm__ __volatile__ ( @@ -2877,118 +3758,100 @@ void fe_pow22523(fe r, const fe a) #else "bl _fe_mul\n\t" #endif /* __APPLE__ */ - "add x0, x29, #48\n\t" - "add x1, x29, #16\n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ - "mov x23, #3\n\t" -#ifndef NDEBUG - "add x0, x29, #48\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #48\n\t" + /* Loop: 5 times */ + "mov x23, #5\n\t" + "ldp x6, x7, [x29, #16]\n\t" + "ldp x8, x9, [x29, #32]\n\t" "\n" "L_fe_pow22523_1_%=: \n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ - "subs x23, x23, #1\n\t" - "bcs L_fe_pow22523_1_%=\n\t" - "add x0, x29, #16\n\t" -#ifndef NDEBUG - "add x1, x29, #48\n\t" -#endif /* !NDEBUG */ - "add x2, x29, #16\n\t" -#ifndef __APPLE__ - "bl fe_mul\n\t" -#else - "bl _fe_mul\n\t" -#endif /* __APPLE__ */ - "add x0, x29, #48\n\t" - "add x1, x29, #16\n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ - "mov x23, #8\n\t" -#ifndef NDEBUG - "add x0, x29, #48\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #48\n\t" - "\n" - "L_fe_pow22523_2_%=: \n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ - "subs x23, x23, #1\n\t" - "bcs L_fe_pow22523_2_%=\n\t" -#ifndef NDEBUG - "add x0, x29, #48\n\t" -#endif /* !NDEBUG */ -#ifndef NDEBUG - "add x1, x29, #48\n\t" -#endif /* !NDEBUG */ - "add x2, x29, #16\n\t" -#ifndef __APPLE__ - "bl fe_mul\n\t" -#else - "bl _fe_mul\n\t" -#endif /* __APPLE__ */ - "add x0, x29, #0x50\n\t" -#ifndef NDEBUG - "add x1, x29, #48\n\t" -#endif /* !NDEBUG */ -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ - "mov x23, #18\n\t" -#ifndef NDEBUG - "add x0, x29, #0x50\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #0x50\n\t" - "\n" - "L_fe_pow22523_3_%=: \n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, 
x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" "subs x23, x23, #1\n\t" - "bcs L_fe_pow22523_3_%=\n\t" - "add x0, x29, #48\n\t" -#ifndef NDEBUG - "add x1, x29, #0x50\n\t" -#endif /* !NDEBUG */ - "add x2, x29, #48\n\t" -#ifndef __APPLE__ - "bl fe_mul\n\t" -#else - "bl _fe_mul\n\t" -#endif /* __APPLE__ */ - "mov x23, #9\n\t" + "bne L_fe_pow22523_1_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #48]\n\t" + "stp x8, x9, [x29, #64]\n\t" #ifndef NDEBUG - "add x0, x29, #48\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #48\n\t" - "\n" - "L_fe_pow22523_4_%=: \n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ - "subs x23, x23, #1\n\t" - "bcs L_fe_pow22523_4_%=\n\t" "add x0, x29, #16\n\t" +#endif /* !NDEBUG */ #ifndef NDEBUG "add x1, x29, #48\n\t" #endif /* !NDEBUG */ @@ -2998,30 +3861,98 @@ void fe_pow22523(fe r, const fe a) #else "bl _fe_mul\n\t" #endif /* __APPLE__ */ - "add x0, x29, #48\n\t" - "add x1, x29, #16\n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ - "mov x23, #48\n\t" -#ifndef NDEBUG - "add x0, x29, #48\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #48\n\t" + /* Loop: 10 times */ + "mov x23, #10\n\t" + "ldp x6, x7, [x29, #16]\n\t" + "ldp x8, x9, [x29, #32]\n\t" "\n" - "L_fe_pow22523_5_%=: \n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ + "L_fe_pow22523_2_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, 
xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" "subs x23, x23, #1\n\t" - "bcs L_fe_pow22523_5_%=\n\t" -#ifndef NDEBUG + "bne L_fe_pow22523_2_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #48]\n\t" + "stp x8, x9, [x29, #64]\n\t" "add x0, x29, #48\n\t" -#endif /* !NDEBUG */ #ifndef NDEBUG "add x1, x29, #48\n\t" #endif /* !NDEBUG */ @@ -3031,54 +3962,298 @@ void fe_pow22523(fe r, const fe a) #else "bl _fe_mul\n\t" #endif /* __APPLE__ */ - "add x0, x29, #0x50\n\t" + /* Loop: 20 times */ + "mov x23, #20\n\t" + "ldp x6, x7, [x29, #48]\n\t" + "ldp x8, x9, [x29, #64]\n\t" + "\n" + "L_fe_pow22523_3_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, 
xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" + "subs x23, x23, #1\n\t" + "bne L_fe_pow22523_3_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #80]\n\t" + "stp x8, x9, [x29, #96]\n\t" #ifndef NDEBUG - "add x1, x29, #48\n\t" + "add x0, x29, #48\n\t" #endif /* !NDEBUG */ + "add x1, x29, #0x50\n\t" + "add x2, x29, #48\n\t" #ifndef __APPLE__ - "bl fe_sq\n\t" + "bl fe_mul\n\t" #else - "bl _fe_sq\n\t" + "bl _fe_mul\n\t" #endif /* __APPLE__ */ - "mov x23, #0x62\n\t" -#ifndef NDEBUG - "add x0, x29, #0x50\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #0x50\n\t" + /* Loop: 10 times */ + "mov x23, #10\n\t" + "ldp x6, x7, [x29, #48]\n\t" + "ldp x8, x9, [x29, #64]\n\t" "\n" - "L_fe_pow22523_6_%=: \n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ + "L_fe_pow22523_4_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" "subs x23, x23, #1\n\t" - "bcs L_fe_pow22523_6_%=\n\t" - "add x0, x29, #48\n\t" -#ifndef NDEBUG - "add x1, x29, #0x50\n\t" -#endif /* !NDEBUG */ - "add x2, x29, #48\n\t" + "bne L_fe_pow22523_4_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #48]\n\t" + "stp x8, x9, [x29, #64]\n\t" + "add x0, x29, 
#16\n\t" + "add x1, x29, #48\n\t" + "add x2, x29, #16\n\t" #ifndef __APPLE__ "bl fe_mul\n\t" #else "bl _fe_mul\n\t" #endif /* __APPLE__ */ - "mov x23, #49\n\t" -#ifndef NDEBUG - "add x0, x29, #48\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #48\n\t" + /* Loop: 50 times */ + "mov x23, #50\n\t" + "ldp x6, x7, [x29, #16]\n\t" + "ldp x8, x9, [x29, #32]\n\t" "\n" - "L_fe_pow22523_7_%=: \n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ + "L_fe_pow22523_5_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" "subs x23, x23, #1\n\t" - "bcs L_fe_pow22523_7_%=\n\t" - "add x0, x29, #16\n\t" + "bne L_fe_pow22523_5_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #48]\n\t" + "stp x8, x9, [x29, #64]\n\t" + "add x0, x29, #48\n\t" #ifndef NDEBUG "add x1, x29, #48\n\t" #endif /* !NDEBUG */ @@ -3088,2698 +4263,1995 @@ void fe_pow22523(fe r, const fe a) #else "bl _fe_mul\n\t" #endif /* __APPLE__ */ - "mov x23, #1\n\t" -#ifndef NDEBUG - "add x0, x29, #16\n\t" -#endif /* !NDEBUG */ - "add x1, x29, #16\n\t" + /* Loop: 100 times */ + "mov x23, #0x64\n\t" + "ldp x6, x7, [x29, #48]\n\t" + "ldp x8, x9, [x29, #64]\n\t" "\n" - "L_fe_pow22523_8_%=: \n\t" -#ifndef __APPLE__ - "bl fe_sq\n\t" -#else - "bl _fe_sq\n\t" -#endif /* __APPLE__ */ + "L_fe_pow22523_6_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + 
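The unrolled squaring loops with counts 5, 10, 20, 10, 50, 100 and 50, interleaved with fe_mul calls, implement the fixed addition chain for a^(2^252 − 3) = a^((p − 5)/8). A sketch of that chain in terms of the fe_sq/fe_mul primitives already used by this file (fe_pow22523_sketch and the t0–t2 temporaries are illustrative only; the patch inlines the squarings instead of calling fe_sq):

    /* r = a^((p-5)/8); loop counts mirror the unrolled assembly above. */
    static void fe_pow22523_sketch(fe r, const fe a)
    {
        fe t0, t1, t2;
        int i;

        fe_sq(t0, a);                                                    /* a^2        */
        fe_sq(t1, t0); fe_sq(t1, t1); fe_mul(t1, a, t1);                 /* a^9        */
        fe_mul(t0, t0, t1);                                              /* a^11       */
        fe_sq(t0, t0); fe_mul(t0, t1, t0);                               /* a^(2^5-1)  */
        fe_sq(t1, t0); for (i = 1; i <   5; i++) fe_sq(t1, t1); fe_mul(t0, t1, t0); /* 2^10-1  */
        fe_sq(t1, t0); for (i = 1; i <  10; i++) fe_sq(t1, t1); fe_mul(t1, t1, t0); /* 2^20-1  */
        fe_sq(t2, t1); for (i = 1; i <  20; i++) fe_sq(t2, t2); fe_mul(t1, t2, t1); /* 2^40-1  */
        fe_sq(t1, t1); for (i = 1; i <  10; i++) fe_sq(t1, t1); fe_mul(t0, t1, t0); /* 2^50-1  */
        fe_sq(t1, t0); for (i = 1; i <  50; i++) fe_sq(t1, t1); fe_mul(t1, t1, t0); /* 2^100-1 */
        fe_sq(t2, t1); for (i = 1; i < 100; i++) fe_sq(t2, t2); fe_mul(t1, t2, t1); /* 2^200-1 */
        fe_sq(t1, t1); for (i = 1; i <  50; i++) fe_sq(t1, t1); fe_mul(t0, t1, t0); /* 2^250-1 */
        fe_sq(t0, t0); fe_sq(t0, t0);                                    /* a^(2^252-4) */
        fe_mul(r, t0, a);                                                /* a^(2^252-3) */
    }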
"umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" "subs x23, x23, #1\n\t" - "bcs L_fe_pow22523_8_%=\n\t" - "ldr x0, [x29, #112]\n\t" + "bne L_fe_pow22523_6_%=\n\t" + /* Store */ + "stp x6, x7, [x29, #80]\n\t" + "stp x8, x9, [x29, #96]\n\t" #ifndef NDEBUG - "add x1, x29, #16\n\t" + "add x0, x29, #48\n\t" #endif /* !NDEBUG */ - "ldr x2, [x29, #120]\n\t" + "add x1, x29, #0x50\n\t" + "add x2, x29, #48\n\t" #ifndef __APPLE__ "bl fe_mul\n\t" #else "bl _fe_mul\n\t" #endif /* __APPLE__ */ - "ldp x29, x30, [sp], #0x80\n\t" - : [r] "+r" (r), [a] "+r" (a) - : - : "memory", "x2", "x23" - ); -} - -void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, const fe pt) -{ - __asm__ __volatile__ ( - "stp x29, x30, [sp, #-64]!\n\t" - "add x29, sp, #0\n\t" - "str %x[ry], [x29, #16]\n\t" - "str %x[rz], [x29, #24]\n\t" - "str %x[px], [x29, #32]\n\t" - "str %x[py], [x29, #40]\n\t" - "str %x[pz], [x29, #48]\n\t" - "str %x[pt], [x29, #56]\n\t" - "ldr x1, [x29, #32]\n\t" - "ldr x2, [x29, #56]\n\t" - /* Multiply */ - "ldp x11, x12, [x1]\n\t" - "ldp x13, x14, [x1, #16]\n\t" - "ldp x15, x16, [x2]\n\t" - "ldp x17, x19, [x2, #16]\n\t" - /* A[0] * B[0] */ - "mul x3, x11, x15\n\t" - "umulh x4, x11, x15\n\t" - /* A[0] * B[1] */ - "mul x20, x11, x16\n\t" - "umulh x5, x11, x16\n\t" - "adds x4, x4, x20\n\t" - "adc x5, x5, xzr\n\t" - /* A[1] * B[0] */ - "mul x20, x12, x15\n\t" - "umulh x21, x12, x15\n\t" - "adds x4, x4, x20\n\t" - "adcs x5, x5, x21\n\t" - "adc x6, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x20, x11, x17\n\t" - "umulh x21, x11, x17\n\t" - "adds x5, x5, x20\n\t" - "adc x6, x6, x21\n\t" - /* A[1] * B[1] */ - "mul x20, x12, x16\n\t" - "umulh x21, x12, x16\n\t" - "adds x5, x5, x20\n\t" - "adcs x6, x6, x21\n\t" - "adc x7, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x20, x13, x15\n\t" - "umulh x21, x13, x15\n\t" - "adds x5, x5, x20\n\t" - "adcs x6, x6, x21\n\t" - "adc x7, x7, xzr\n\t" - /* A[0] * B[3] */ - "mul x20, x11, x19\n\t" - "umulh x21, x11, x19\n\t" - "adds x6, 
x6, x20\n\t" - "adcs x7, x7, x21\n\t" - "adc x8, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x20, x12, x17\n\t" - "umulh x21, x12, x17\n\t" - "adds x6, x6, x20\n\t" - "adcs x7, x7, x21\n\t" - "adc x8, x8, xzr\n\t" - /* A[2] * B[1] */ - "mul x20, x13, x16\n\t" - "umulh x21, x13, x16\n\t" - "adds x6, x6, x20\n\t" - "adcs x7, x7, x21\n\t" - "adc x8, x8, xzr\n\t" - /* A[3] * B[0] */ - "mul x20, x14, x15\n\t" - "umulh x21, x14, x15\n\t" - "adds x6, x6, x20\n\t" - "adcs x7, x7, x21\n\t" - "adc x8, x8, xzr\n\t" - /* A[1] * B[3] */ - "mul x20, x12, x19\n\t" - "umulh x21, x12, x19\n\t" - "adds x7, x7, x20\n\t" - "adcs x8, x8, x21\n\t" - "adc x9, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x20, x13, x17\n\t" - "umulh x21, x13, x17\n\t" - "adds x7, x7, x20\n\t" - "adcs x8, x8, x21\n\t" - "adc x9, x9, xzr\n\t" - /* A[3] * B[1] */ - "mul x20, x14, x16\n\t" - "umulh x21, x14, x16\n\t" - "adds x7, x7, x20\n\t" - "adcs x8, x8, x21\n\t" - "adc x9, x9, xzr\n\t" - /* A[2] * B[3] */ - "mul x20, x13, x19\n\t" - "umulh x21, x13, x19\n\t" - "adds x8, x8, x20\n\t" - "adcs x9, x9, x21\n\t" - "adc x10, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x20, x14, x17\n\t" - "umulh x21, x14, x17\n\t" - "adds x8, x8, x20\n\t" - "adcs x9, x9, x21\n\t" - "adc x10, x10, xzr\n\t" - /* A[3] * B[3] */ - "mul x20, x14, x19\n\t" - "umulh x21, x14, x19\n\t" - "adds x9, x9, x20\n\t" - "adc x10, x10, x21\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "extr x7, x7, x6, #63\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x20, #19\n\t" - "mul x21, x20, x7\n\t" - "umulh x7, x20, x7\n\t" - "adds x3, x3, x21\n\t" - "mul x21, x20, x8\n\t" - "umulh x8, x20, x8\n\t" - "adcs x4, x4, x21\n\t" - "mul x21, x20, x9\n\t" - "umulh x9, x20, x9\n\t" - "adcs x5, x5, x21\n\t" - "mul x21, x20, x10\n\t" - "umulh x22, x20, x10\n\t" - "adcs x6, x6, x21\n\t" - "adc x22, x22, xzr\n\t" - /* Add remaining product results in */ - "adds x4, x4, x7\n\t" - "adcs x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adc x22, x22, xzr\n\t" - /* Overflow */ - "extr x22, x22, x6, #63\n\t" - "mul x22, x22, x20\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x22\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "adc x6, x6, xzr\n\t" - /* Reduce if top bit set */ - "and x22, x20, x6, asr 63\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x22\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "adc x6, x6, xzr\n\t" - /* Store */ - "stp x3, x4, [x0]\n\t" - "stp x5, x6, [x0, #16]\n\t" - "ldr x0, [x29, #16]\n\t" - "ldr x1, [x29, #40]\n\t" - "ldr x2, [x29, #48]\n\t" - /* Multiply */ - "ldp x11, x12, [x1]\n\t" - "ldp x13, x14, [x1, #16]\n\t" - "ldp x15, x16, [x2]\n\t" - "ldp x17, x19, [x2, #16]\n\t" - /* A[0] * B[0] */ - "mul x3, x11, x15\n\t" - "umulh x4, x11, x15\n\t" - /* A[0] * B[1] */ - "mul x20, x11, x16\n\t" - "umulh x5, x11, x16\n\t" - "adds x4, x4, x20\n\t" - "adc x5, x5, xzr\n\t" - /* A[1] * B[0] */ - "mul x20, x12, x15\n\t" - "umulh x21, x12, x15\n\t" - "adds x4, x4, x20\n\t" - "adcs x5, x5, x21\n\t" - "adc x6, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x20, x11, x17\n\t" - "umulh x21, x11, x17\n\t" - "adds x5, x5, x20\n\t" - "adc x6, x6, x21\n\t" - /* A[1] * B[1] */ - "mul x20, x12, x16\n\t" - "umulh x21, x12, x16\n\t" - "adds x5, x5, x20\n\t" - "adcs x6, x6, x21\n\t" - "adc x7, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x20, x13, x15\n\t" - "umulh x21, x13, x15\n\t" - "adds x5, x5, x20\n\t" - 
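Each "Multiply" block in these routines is a plain 4×4-limb schoolbook product (mul/umulh pairs chained with adds/adcs) producing an 8-limb intermediate that the reduction step then folds. A minimal C sketch of that product step, assuming uint64_t limbs and the unsigned __int128 extension (the function name is illustrative only):

    #include <stdint.h>

    /* t[0..7] = a[0..3] * b[0..3], operand-scanning schoolbook multiply. */
    static void fe_mul_wide_sketch(uint64_t t[8], const uint64_t a[4], const uint64_t b[4])
    {
        int i, j;
        for (i = 0; i < 8; i++) t[i] = 0;
        for (i = 0; i < 4; i++) {
            uint64_t carry = 0;
            for (j = 0; j < 4; j++) {
                unsigned __int128 m = (unsigned __int128)a[i] * b[j] + t[i + j] + carry;
                t[i + j] = (uint64_t)m;      /* low 64 bits stay in place   */
                carry    = (uint64_t)(m >> 64); /* high 64 bits carry upward */
            }
            t[i + 4] = carry;
        }
    }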
"adcs x6, x6, x21\n\t" - "adc x7, x7, xzr\n\t" - /* A[0] * B[3] */ - "mul x20, x11, x19\n\t" - "umulh x21, x11, x19\n\t" - "adds x6, x6, x20\n\t" - "adcs x7, x7, x21\n\t" - "adc x8, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x20, x12, x17\n\t" - "umulh x21, x12, x17\n\t" - "adds x6, x6, x20\n\t" - "adcs x7, x7, x21\n\t" - "adc x8, x8, xzr\n\t" - /* A[2] * B[1] */ - "mul x20, x13, x16\n\t" - "umulh x21, x13, x16\n\t" - "adds x6, x6, x20\n\t" - "adcs x7, x7, x21\n\t" - "adc x8, x8, xzr\n\t" - /* A[3] * B[0] */ - "mul x20, x14, x15\n\t" - "umulh x21, x14, x15\n\t" - "adds x6, x6, x20\n\t" - "adcs x7, x7, x21\n\t" - "adc x8, x8, xzr\n\t" - /* A[1] * B[3] */ - "mul x20, x12, x19\n\t" - "umulh x21, x12, x19\n\t" - "adds x7, x7, x20\n\t" - "adcs x8, x8, x21\n\t" - "adc x9, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x20, x13, x17\n\t" - "umulh x21, x13, x17\n\t" - "adds x7, x7, x20\n\t" - "adcs x8, x8, x21\n\t" - "adc x9, x9, xzr\n\t" - /* A[3] * B[1] */ - "mul x20, x14, x16\n\t" - "umulh x21, x14, x16\n\t" - "adds x7, x7, x20\n\t" - "adcs x8, x8, x21\n\t" - "adc x9, x9, xzr\n\t" - /* A[2] * B[3] */ - "mul x20, x13, x19\n\t" - "umulh x21, x13, x19\n\t" - "adds x8, x8, x20\n\t" - "adcs x9, x9, x21\n\t" - "adc x10, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x20, x14, x17\n\t" - "umulh x21, x14, x17\n\t" - "adds x8, x8, x20\n\t" - "adcs x9, x9, x21\n\t" - "adc x10, x10, xzr\n\t" - /* A[3] * B[3] */ - "mul x20, x14, x19\n\t" - "umulh x21, x14, x19\n\t" - "adds x9, x9, x20\n\t" - "adc x10, x10, x21\n\t" + /* Loop: 50 times */ + "mov x23, #50\n\t" + "ldp x6, x7, [x29, #48]\n\t" + "ldp x8, x9, [x29, #64]\n\t" + "\n" + "L_fe_pow22523_7_%=: \n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x12, x6, x7\n\t" + "mul x11, x6, x7\n\t" + /* A[0] * A[3] */ + "umulh x14, x6, x9\n\t" + "mul x13, x6, x9\n\t" + /* A[0] * A[2] */ + "mul x3, x6, x8\n\t" + "adds x12, x12, x3\n\t" + "umulh x4, x6, x8\n\t" + "adcs x13, x13, x4\n\t" + /* A[1] * A[3] */ + "mul x3, x7, x9\n\t" + "adcs x14, x14, x3\n\t" + "umulh x15, x7, x9\n\t" + "adc x15, x15, xzr\n\t" + /* A[1] * A[2] */ + "mul x3, x7, x8\n\t" + "adds x13, x13, x3\n\t" + "umulh x4, x7, x8\n\t" + "adcs x14, x14, x4\n\t" + /* A[2] * A[3] */ + "mul x3, x8, x9\n\t" + "adcs x15, x15, x3\n\t" + "umulh x16, x8, x9\n\t" + "adc x16, x16, xzr\n\t" + /* Double */ + "adds x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adcs x15, x15, x15\n\t" + "adcs x16, x16, x16\n\t" + "adc x17, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x4, x6, x6\n\t" + "mul x10, x6, x6\n\t" + /* A[1] * A[1] */ + "mul x3, x7, x7\n\t" + "adds x11, x11, x4\n\t" + "umulh x4, x7, x7\n\t" + "adcs x12, x12, x3\n\t" + /* A[2] * A[2] */ + "mul x3, x8, x8\n\t" + "adcs x13, x13, x4\n\t" + "umulh x4, x8, x8\n\t" + "adcs x14, x14, x3\n\t" + /* A[3] * A[3] */ + "mul x3, x9, x9\n\t" + "adcs x15, x15, x4\n\t" + "umulh x4, x9, x9\n\t" + "adcs x16, x16, x3\n\t" + "adc x17, x17, x4\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "extr x7, x7, x6, #63\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x20, #19\n\t" - "mul x21, x20, x7\n\t" - "umulh x7, x20, x7\n\t" - "adds x3, x3, x21\n\t" - "mul x21, x20, x8\n\t" - "umulh x8, x20, x8\n\t" - "adcs x4, x4, x21\n\t" - "mul x21, x20, x9\n\t" - "umulh x9, x20, x9\n\t" - "adcs x5, x5, x21\n\t" - "mul x21, x20, x10\n\t" - "umulh x22, x20, x10\n\t" - "adcs x6, x6, x21\n\t" - "adc x22, x22, 
xzr\n\t" - /* Add remaining product results in */ - "adds x4, x4, x7\n\t" - "adcs x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adc x22, x22, xzr\n\t" - /* Overflow */ - "extr x22, x22, x6, #63\n\t" - "mul x22, x22, x20\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x22\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "adc x6, x6, xzr\n\t" - /* Reduce if top bit set */ - "and x22, x20, x6, asr 63\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x22\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "adc x6, x6, xzr\n\t" - /* Store */ - "stp x3, x4, [x0]\n\t" - "stp x5, x6, [x0, #16]\n\t" - "ldr x0, [x29, #24]\n\t" - "ldr x2, [x29, #56]\n\t" - /* Multiply */ - "ldp x11, x12, [x2]\n\t" - "ldp x13, x14, [x2, #16]\n\t" - /* A[0] * B[0] */ - "mul x3, x15, x11\n\t" - "umulh x4, x15, x11\n\t" - /* A[0] * B[1] */ - "mul x20, x15, x12\n\t" - "umulh x5, x15, x12\n\t" - "adds x4, x4, x20\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x17\n\t" + "adds x13, x13, x4\n\t" + "umulh x5, x3, x17\n\t" "adc x5, x5, xzr\n\t" - /* A[1] * B[0] */ - "mul x20, x16, x11\n\t" - "umulh x21, x16, x11\n\t" - "adds x4, x4, x20\n\t" - "adcs x5, x5, x21\n\t" - "adc x6, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x20, x15, x13\n\t" - "umulh x21, x15, x13\n\t" - "adds x5, x5, x20\n\t" - "adc x6, x6, x21\n\t" - /* A[1] * B[1] */ - "mul x20, x16, x12\n\t" - "umulh x21, x16, x12\n\t" - "adds x5, x5, x20\n\t" - "adcs x6, x6, x21\n\t" - "adc x7, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x20, x17, x11\n\t" - "umulh x21, x17, x11\n\t" - "adds x5, x5, x20\n\t" - "adcs x6, x6, x21\n\t" - "adc x7, x7, xzr\n\t" - /* A[0] * B[3] */ - "mul x20, x15, x14\n\t" - "umulh x21, x15, x14\n\t" - "adds x6, x6, x20\n\t" - "adcs x7, x7, x21\n\t" - "adc x8, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x20, x16, x13\n\t" - "umulh x21, x16, x13\n\t" - "adds x6, x6, x20\n\t" - "adcs x7, x7, x21\n\t" - "adc x8, x8, xzr\n\t" - /* A[2] * B[1] */ - "mul x20, x17, x12\n\t" - "umulh x21, x17, x12\n\t" - "adds x6, x6, x20\n\t" - "adcs x7, x7, x21\n\t" - "adc x8, x8, xzr\n\t" - /* A[3] * B[0] */ - "mul x20, x19, x11\n\t" - "umulh x21, x19, x11\n\t" - "adds x6, x6, x20\n\t" - "adcs x7, x7, x21\n\t" - "adc x8, x8, xzr\n\t" - /* A[1] * B[3] */ - "mul x20, x16, x14\n\t" - "umulh x21, x16, x14\n\t" - "adds x7, x7, x20\n\t" - "adcs x8, x8, x21\n\t" - "adc x9, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x20, x17, x13\n\t" - "umulh x21, x17, x13\n\t" - "adds x7, x7, x20\n\t" - "adcs x8, x8, x21\n\t" - "adc x9, x9, xzr\n\t" - /* A[3] * B[1] */ - "mul x20, x19, x12\n\t" - "umulh x21, x19, x12\n\t" - "adds x7, x7, x20\n\t" - "adcs x8, x8, x21\n\t" - "adc x9, x9, xzr\n\t" - /* A[2] * B[3] */ - "mul x20, x17, x14\n\t" - "umulh x21, x17, x14\n\t" - "adds x8, x8, x20\n\t" - "adcs x9, x9, x21\n\t" - "adc x10, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x20, x19, x13\n\t" - "umulh x21, x19, x13\n\t" - "adds x8, x8, x20\n\t" - "adcs x9, x9, x21\n\t" - "adc x10, x10, xzr\n\t" - /* A[3] * B[3] */ - "mul x20, x19, x14\n\t" - "umulh x21, x19, x14\n\t" - "adds x9, x9, x20\n\t" - "adc x10, x10, x21\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "extr x7, x7, x6, #63\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x20, #19\n\t" - "mul x21, x20, x7\n\t" - "umulh x7, x20, x7\n\t" - "adds x3, x3, x21\n\t" - "mul x21, x20, x8\n\t" - "umulh x8, x20, x8\n\t" - "adcs x4, x4, x21\n\t" - "mul x21, x20, x9\n\t" - "umulh 
x9, x20, x9\n\t" - "adcs x5, x5, x21\n\t" - "mul x21, x20, x10\n\t" - "umulh x22, x20, x10\n\t" - "adcs x6, x6, x21\n\t" - "adc x22, x22, xzr\n\t" - /* Add remaining product results in */ - "adds x4, x4, x7\n\t" - "adcs x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adc x22, x22, xzr\n\t" - /* Overflow */ - "extr x22, x22, x6, #63\n\t" - "mul x22, x22, x20\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x22\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "adc x6, x6, xzr\n\t" - /* Reduce if top bit set */ - "and x22, x20, x6, asr 63\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x22\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "adc x6, x6, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x13, #63\n\t" + "mul x5, x5, x3\n\t" + "and x13, x13, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x14\n\t" + "adds x10, x10, x4\n\t" + "umulh x14, x3, x14\n\t" + "mul x4, x3, x15\n\t" + "adcs x11, x11, x4\n\t" + "umulh x15, x3, x15\n\t" + "mul x4, x3, x16\n\t" + "adcs x12, x12, x4\n\t" + "umulh x16, x3, x16\n\t" + "adc x13, x13, xzr\n\t" + /* Add high product results in */ + "adds x6, x10, x5\n\t" + "adcs x7, x11, x14\n\t" + "adcs x8, x12, x15\n\t" + "adc x9, x13, x16\n\t" + "subs x23, x23, #1\n\t" + "bne L_fe_pow22523_7_%=\n\t" /* Store */ - "stp x3, x4, [x0]\n\t" - "stp x5, x6, [x0, #16]\n\t" - "ldp x29, x30, [sp], #0x40\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) + "stp x6, x7, [x29, #48]\n\t" + "stp x8, x9, [x29, #64]\n\t" + "add x0, x29, #16\n\t" + "add x1, x29, #48\n\t" + "add x2, x29, #16\n\t" +#ifndef __APPLE__ + "bl fe_mul\n\t" +#else + "bl _fe_mul\n\t" +#endif /* __APPLE__ */ +#ifndef NDEBUG + "add x0, x29, #16\n\t" +#endif /* !NDEBUG */ + "add x1, x29, #16\n\t" +#ifndef __APPLE__ + "bl fe_sq\n\t" +#else + "bl _fe_sq\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "bl fe_sq\n\t" +#else + "bl _fe_sq\n\t" +#endif /* __APPLE__ */ + "ldr x0, [x29, #112]\n\t" +#ifndef NDEBUG + "add x1, x29, #16\n\t" +#endif /* !NDEBUG */ + "ldr x2, [x29, #120]\n\t" +#ifndef __APPLE__ + "bl fe_mul\n\t" +#else + "bl _fe_mul\n\t" +#endif /* __APPLE__ */ + "ldp x29, x30, [sp], #0x80\n\t" + : [r] "+r" (r), [a] "+r" (a) : - : "memory", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22" + : "memory", "x2", "x23", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17" ); } -void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt) +void ge_p1p1_to_p2(ge_p2* r, const ge_p1p1* p) { __asm__ __volatile__ ( - "stp x29, x30, [sp, #-96]!\n\t" + "stp x29, x30, [sp, #-32]!\n\t" "add x29, sp, #0\n\t" - "str %x[ry], [x29, #16]\n\t" - "str %x[rz], [x29, #24]\n\t" - "str %x[rt], [x29, #32]\n\t" - "str %x[px], [x29, #40]\n\t" - "str %x[py], [x29, #48]\n\t" - "str %x[pz], [x29, #56]\n\t" - "str %x[pt], [x29, #64]\n\t" - "ldr x1, [x29, #40]\n\t" - "ldr x2, [x29, #64]\n\t" + "str %x[r], [x29, #16]\n\t" + "str %x[p], [x29, #24]\n\t" + "mov x2, x1\n\t" + "add x1, x1, #0x60\n\t" /* Multiply */ - "ldp x11, x12, [x1]\n\t" - "ldp x13, x14, [x1, #16]\n\t" - "ldp x15, x16, [x2]\n\t" - "ldp x17, x19, [x2, #16]\n\t" - /* A[0] * B[0] */ - "mul x3, x11, x15\n\t" - "umulh x4, x11, x15\n\t" - /* A[0] * B[1] */ - "mul x24, x11, x16\n\t" - "umulh x5, x11, x16\n\t" - "adds x4, x4, x24\n\t" + "ldp x10, x11, [x1]\n\t" + "ldp x12, x13, [x1, #16]\n\t" + "ldp x6, x7, [x2]\n\t" + "ldp x8, x9, [x2, 
#16]\n\t" + /* A[0] * B[0] */ + "umulh x15, x10, x6\n\t" + "mul x14, x10, x6\n\t" + /* A[2] * B[0] */ + "umulh x17, x12, x6\n\t" + "mul x16, x12, x6\n\t" + /* A[1] * B[0] */ + "mul x3, x11, x6\n\t" + "adds x15, x15, x3\n\t" + "umulh x4, x11, x6\n\t" + "adcs x16, x16, x4\n\t" + "adc x17, x17, xzr\n\t" + /* A[1] * B[3] */ + "umulh x20, x11, x9\n\t" + "mul x19, x11, x9\n\t" + /* A[0] * B[1] */ + "mul x3, x10, x7\n\t" + "adds x15, x15, x3\n\t" + "umulh x4, x10, x7\n\t" + "adcs x16, x16, x4\n\t" + /* A[2] * B[1] */ + "mul x3, x12, x7\n\t" + "adcs x17, x17, x3\n\t" + "umulh x4, x12, x7\n\t" + "adcs x19, x19, x4\n\t" + "adc x20, x20, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x11, x8\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x11, x8\n\t" + "adcs x19, x19, x4\n\t" + "adcs x20, x20, xzr\n\t" + "adc x21, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x10, x8\n\t" + "adds x16, x16, x3\n\t" + "umulh x4, x10, x8\n\t" + "adcs x17, x17, x4\n\t" + "adcs x19, x19, xzr\n\t" + "adcs x20, x20, xzr\n\t" + "adc x21, x21, xzr\n\t" + /* A[1] * B[1] */ + "mul x3, x11, x7\n\t" + "adds x16, x16, x3\n\t" + "umulh x4, x11, x7\n\t" + "adcs x17, x17, x4\n\t" + /* A[3] * B[1] */ + "mul x3, x13, x7\n\t" + "adcs x19, x19, x3\n\t" + "umulh x4, x13, x7\n\t" + "adcs x20, x20, x4\n\t" + "adc x21, x21, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x12, x8\n\t" + "adds x19, x19, x3\n\t" + "umulh x4, x12, x8\n\t" + "adcs x20, x20, x4\n\t" + /* A[3] * B[3] */ + "mul x3, x13, x9\n\t" + "adcs x21, x21, x3\n\t" + "umulh x22, x13, x9\n\t" + "adc x22, x22, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x10, x9\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x10, x9\n\t" + "adcs x19, x19, x4\n\t" + /* A[2] * B[3] */ + "mul x3, x12, x9\n\t" + "adcs x20, x20, x3\n\t" + "umulh x4, x12, x9\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, x22, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x13, x6\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x13, x6\n\t" + "adcs x19, x19, x4\n\t" + /* A[3] * B[2] */ + "mul x3, x13, x8\n\t" + "adcs x20, x20, x3\n\t" + "umulh x4, x13, x8\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, x22, xzr\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x22\n\t" + "adds x17, x17, x4\n\t" + "umulh x5, x3, x22\n\t" "adc x5, x5, xzr\n\t" - /* A[1] * B[0] */ - "mul x24, x12, x15\n\t" - "umulh x25, x12, x15\n\t" - "adds x4, x4, x24\n\t" - "adcs x5, x5, x25\n\t" - "adc x6, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x24, x11, x17\n\t" - "umulh x25, x11, x17\n\t" - "adds x5, x5, x24\n\t" - "adc x6, x6, x25\n\t" - /* A[1] * B[1] */ - "mul x24, x12, x16\n\t" - "umulh x25, x12, x16\n\t" - "adds x5, x5, x24\n\t" - "adcs x6, x6, x25\n\t" - "adc x7, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x24, x13, x15\n\t" - "umulh x25, x13, x15\n\t" - "adds x5, x5, x24\n\t" - "adcs x6, x6, x25\n\t" - "adc x7, x7, xzr\n\t" - /* A[0] * B[3] */ - "mul x24, x11, x19\n\t" - "umulh x25, x11, x19\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" - "adc x8, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x24, x12, x17\n\t" - "umulh x25, x12, x17\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" - "adc x8, x8, xzr\n\t" - /* A[2] * B[1] */ - "mul x24, x13, x16\n\t" - "umulh x25, x13, x16\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" - "adc x8, x8, xzr\n\t" - /* A[3] * B[0] */ - "mul x24, x14, x15\n\t" - "umulh x25, x14, x15\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" - "adc x8, x8, xzr\n\t" - /* A[1] * B[3] */ - "mul x24, x12, x19\n\t" - "umulh x25, x12, x19\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" - "adc x9, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x24, 
x13, x17\n\t" - "umulh x25, x13, x17\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" - "adc x9, x9, xzr\n\t" - /* A[3] * B[1] */ - "mul x24, x14, x16\n\t" - "umulh x25, x14, x16\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" - "adc x9, x9, xzr\n\t" - /* A[2] * B[3] */ - "mul x24, x13, x19\n\t" - "umulh x25, x13, x19\n\t" - "adds x8, x8, x24\n\t" - "adcs x9, x9, x25\n\t" - "adc x10, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x24, x14, x17\n\t" - "umulh x25, x14, x17\n\t" - "adds x8, x8, x24\n\t" - "adcs x9, x9, x25\n\t" - "adc x10, x10, xzr\n\t" - /* A[3] * B[3] */ - "mul x24, x14, x19\n\t" - "umulh x25, x14, x19\n\t" - "adds x9, x9, x24\n\t" - "adc x10, x10, x25\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "extr x7, x7, x6, #63\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x24, #19\n\t" - "mul x25, x24, x7\n\t" - "umulh x7, x24, x7\n\t" - "adds x3, x3, x25\n\t" - "mul x25, x24, x8\n\t" - "umulh x8, x24, x8\n\t" - "adcs x4, x4, x25\n\t" - "mul x25, x24, x9\n\t" - "umulh x9, x24, x9\n\t" - "adcs x5, x5, x25\n\t" - "mul x25, x24, x10\n\t" - "umulh x26, x24, x10\n\t" - "adcs x6, x6, x25\n\t" - "adc x26, x26, xzr\n\t" - /* Add remaining product results in */ - "adds x4, x4, x7\n\t" - "adcs x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adc x26, x26, xzr\n\t" - /* Overflow */ - "extr x26, x26, x6, #63\n\t" - "mul x26, x26, x24\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x26\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "adc x6, x6, xzr\n\t" - /* Reduce if top bit set */ - "and x26, x24, x6, asr 63\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x26\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "adc x6, x6, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x17, #63\n\t" + "mul x5, x5, x3\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x19\n\t" + "adds x14, x14, x4\n\t" + "umulh x19, x3, x19\n\t" + "mul x4, x3, x20\n\t" + "adcs x15, x15, x4\n\t" + "umulh x20, x3, x20\n\t" + "mul x4, x3, x21\n\t" + "adcs x16, x16, x4\n\t" + "umulh x21, x3, x21\n\t" + "adc x17, x17, xzr\n\t" + /* Add high product results in */ + "adds x14, x14, x5\n\t" + "adcs x15, x15, x19\n\t" + "adcs x16, x16, x20\n\t" + "adc x17, x17, x21\n\t" /* Store */ - "stp x3, x4, [x0]\n\t" - "stp x5, x6, [x0, #16]\n\t" - "ldr x0, [x29, #32]\n\t" - "ldr x2, [x29, #48]\n\t" + "stp x14, x15, [x0]\n\t" + "stp x16, x17, [x0, #16]\n\t" + "sub x2, x1, #32\n\t" + "add x0, x0, #0x40\n\t" /* Multiply */ - "ldp x20, x21, [x2]\n\t" - "ldp x22, x23, [x2, #16]\n\t" - /* A[0] * B[0] */ - "mul x3, x11, x20\n\t" - "umulh x4, x11, x20\n\t" - /* A[0] * B[1] */ - "mul x24, x11, x21\n\t" - "umulh x5, x11, x21\n\t" - "adds x4, x4, x24\n\t" - "adc x5, x5, xzr\n\t" - /* A[1] * B[0] */ - "mul x24, x12, x20\n\t" - "umulh x25, x12, x20\n\t" - "adds x4, x4, x24\n\t" - "adcs x5, x5, x25\n\t" - "adc x6, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x24, x11, x22\n\t" - "umulh x25, x11, x22\n\t" - "adds x5, x5, x24\n\t" - "adc x6, x6, x25\n\t" - /* A[1] * B[1] */ - "mul x24, x12, x21\n\t" - "umulh x25, x12, x21\n\t" - "adds x5, x5, x24\n\t" - "adcs x6, x6, x25\n\t" - "adc x7, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x24, x13, x20\n\t" - "umulh x25, x13, x20\n\t" - "adds x5, x5, x24\n\t" - "adcs x6, x6, x25\n\t" - "adc x7, x7, xzr\n\t" - /* A[0] * B[3] */ - "mul x24, x11, x23\n\t" - "umulh x25, x11, x23\n\t" - "adds x6, x6, 
x24\n\t" - "adcs x7, x7, x25\n\t" - "adc x8, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x24, x12, x22\n\t" - "umulh x25, x12, x22\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" - "adc x8, x8, xzr\n\t" - /* A[2] * B[1] */ - "mul x24, x13, x21\n\t" - "umulh x25, x13, x21\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" - "adc x8, x8, xzr\n\t" - /* A[3] * B[0] */ - "mul x24, x14, x20\n\t" - "umulh x25, x14, x20\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" - "adc x8, x8, xzr\n\t" - /* A[1] * B[3] */ - "mul x24, x12, x23\n\t" - "umulh x25, x12, x23\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" - "adc x9, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x24, x13, x22\n\t" - "umulh x25, x13, x22\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" - "adc x9, x9, xzr\n\t" - /* A[3] * B[1] */ - "mul x24, x14, x21\n\t" - "umulh x25, x14, x21\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" - "adc x9, x9, xzr\n\t" - /* A[2] * B[3] */ - "mul x24, x13, x23\n\t" - "umulh x25, x13, x23\n\t" - "adds x8, x8, x24\n\t" - "adcs x9, x9, x25\n\t" - "adc x10, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x24, x14, x22\n\t" - "umulh x25, x14, x22\n\t" - "adds x8, x8, x24\n\t" - "adcs x9, x9, x25\n\t" - "adc x10, x10, xzr\n\t" - /* A[3] * B[3] */ - "mul x24, x14, x23\n\t" - "umulh x25, x14, x23\n\t" - "adds x9, x9, x24\n\t" - "adc x10, x10, x25\n\t" + "ldp x6, x7, [x2]\n\t" + "ldp x8, x9, [x2, #16]\n\t" + /* A[0] * B[0] */ + "umulh x15, x10, x6\n\t" + "mul x14, x10, x6\n\t" + /* A[2] * B[0] */ + "umulh x17, x12, x6\n\t" + "mul x16, x12, x6\n\t" + /* A[1] * B[0] */ + "mul x3, x11, x6\n\t" + "adds x15, x15, x3\n\t" + "umulh x4, x11, x6\n\t" + "adcs x16, x16, x4\n\t" + "adc x17, x17, xzr\n\t" + /* A[1] * B[3] */ + "umulh x20, x11, x9\n\t" + "mul x19, x11, x9\n\t" + /* A[0] * B[1] */ + "mul x3, x10, x7\n\t" + "adds x15, x15, x3\n\t" + "umulh x4, x10, x7\n\t" + "adcs x16, x16, x4\n\t" + /* A[2] * B[1] */ + "mul x3, x12, x7\n\t" + "adcs x17, x17, x3\n\t" + "umulh x4, x12, x7\n\t" + "adcs x19, x19, x4\n\t" + "adc x20, x20, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x11, x8\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x11, x8\n\t" + "adcs x19, x19, x4\n\t" + "adcs x20, x20, xzr\n\t" + "adc x21, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x10, x8\n\t" + "adds x16, x16, x3\n\t" + "umulh x4, x10, x8\n\t" + "adcs x17, x17, x4\n\t" + "adcs x19, x19, xzr\n\t" + "adcs x20, x20, xzr\n\t" + "adc x21, x21, xzr\n\t" + /* A[1] * B[1] */ + "mul x3, x11, x7\n\t" + "adds x16, x16, x3\n\t" + "umulh x4, x11, x7\n\t" + "adcs x17, x17, x4\n\t" + /* A[3] * B[1] */ + "mul x3, x13, x7\n\t" + "adcs x19, x19, x3\n\t" + "umulh x4, x13, x7\n\t" + "adcs x20, x20, x4\n\t" + "adc x21, x21, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x12, x8\n\t" + "adds x19, x19, x3\n\t" + "umulh x4, x12, x8\n\t" + "adcs x20, x20, x4\n\t" + /* A[3] * B[3] */ + "mul x3, x13, x9\n\t" + "adcs x21, x21, x3\n\t" + "umulh x22, x13, x9\n\t" + "adc x22, x22, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x10, x9\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x10, x9\n\t" + "adcs x19, x19, x4\n\t" + /* A[2] * B[3] */ + "mul x3, x12, x9\n\t" + "adcs x20, x20, x3\n\t" + "umulh x4, x12, x9\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, x22, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x13, x6\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x13, x6\n\t" + "adcs x19, x19, x4\n\t" + /* A[3] * B[2] */ + "mul x3, x13, x8\n\t" + "adcs x20, x20, x3\n\t" + "umulh x4, x13, x8\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, x22, xzr\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top 
bit from t3 */ - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "extr x7, x7, x6, #63\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x24, #19\n\t" - "mul x25, x24, x7\n\t" - "umulh x7, x24, x7\n\t" - "adds x3, x3, x25\n\t" - "mul x25, x24, x8\n\t" - "umulh x8, x24, x8\n\t" - "adcs x4, x4, x25\n\t" - "mul x25, x24, x9\n\t" - "umulh x9, x24, x9\n\t" - "adcs x5, x5, x25\n\t" - "mul x25, x24, x10\n\t" - "umulh x26, x24, x10\n\t" - "adcs x6, x6, x25\n\t" - "adc x26, x26, xzr\n\t" - /* Add remaining product results in */ - "adds x4, x4, x7\n\t" - "adcs x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adc x26, x26, xzr\n\t" - /* Overflow */ - "extr x26, x26, x6, #63\n\t" - "mul x26, x26, x24\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x26\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "adc x6, x6, xzr\n\t" - /* Reduce if top bit set */ - "and x26, x24, x6, asr 63\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x26\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "adc x6, x6, xzr\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x22\n\t" + "adds x17, x17, x4\n\t" + "umulh x5, x3, x22\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x17, #63\n\t" + "mul x5, x5, x3\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x19\n\t" + "adds x14, x14, x4\n\t" + "umulh x19, x3, x19\n\t" + "mul x4, x3, x20\n\t" + "adcs x15, x15, x4\n\t" + "umulh x20, x3, x20\n\t" + "mul x4, x3, x21\n\t" + "adcs x16, x16, x4\n\t" + "umulh x21, x3, x21\n\t" + "adc x17, x17, xzr\n\t" + /* Add high product results in */ + "adds x14, x14, x5\n\t" + "adcs x15, x15, x19\n\t" + "adcs x16, x16, x20\n\t" + "adc x17, x17, x21\n\t" /* Store */ - "stp x3, x4, [x0]\n\t" - "stp x5, x6, [x0, #16]\n\t" - "ldr x0, [x29, #16]\n\t" - "ldr x2, [x29, #56]\n\t" + "stp x14, x15, [x0]\n\t" + "stp x16, x17, [x0, #16]\n\t" + "sub x1, x1, #0x40\n\t" + "sub x0, x0, #32\n\t" /* Multiply */ - "ldp x11, x12, [x2]\n\t" - "ldp x13, x14, [x2, #16]\n\t" - /* A[0] * B[0] */ - "mul x3, x20, x11\n\t" - "umulh x4, x20, x11\n\t" - /* A[0] * B[1] */ - "mul x24, x20, x12\n\t" - "umulh x5, x20, x12\n\t" - "adds x4, x4, x24\n\t" - "adc x5, x5, xzr\n\t" - /* A[1] * B[0] */ - "mul x24, x21, x11\n\t" - "umulh x25, x21, x11\n\t" - "adds x4, x4, x24\n\t" - "adcs x5, x5, x25\n\t" - "adc x6, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x24, x20, x13\n\t" - "umulh x25, x20, x13\n\t" - "adds x5, x5, x24\n\t" - "adc x6, x6, x25\n\t" - /* A[1] * B[1] */ - "mul x24, x21, x12\n\t" - "umulh x25, x21, x12\n\t" - "adds x5, x5, x24\n\t" - "adcs x6, x6, x25\n\t" - "adc x7, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x24, x22, x11\n\t" - "umulh x25, x22, x11\n\t" - "adds x5, x5, x24\n\t" - "adcs x6, x6, x25\n\t" - "adc x7, x7, xzr\n\t" - /* A[0] * B[3] */ - "mul x24, x20, x14\n\t" - "umulh x25, x20, x14\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" - "adc x8, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x24, x21, x13\n\t" - "umulh x25, x21, x13\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" - "adc x8, x8, xzr\n\t" - /* A[2] * B[1] */ - "mul x24, x22, x12\n\t" - "umulh x25, x22, x12\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" - "adc x8, x8, xzr\n\t" - /* A[3] * B[0] */ - "mul x24, x23, x11\n\t" - "umulh x25, x23, x11\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" - "adc x8, x8, xzr\n\t" - /* A[1] * B[3] */ - "mul x24, x21, x14\n\t" - "umulh x25, x21, x14\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" - 
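The ge_p1p1_to_p2/ge_p1p1_to_p3 routines added here step through the input with fixed offsets (X at 0x00, Y at 0x20, Z at 0x40, T at 0x60) and perform the usual completed-coordinate conversions as three or four field multiplications. Expressed with fe_mul, and assuming the ref10-style X/Y/Z/T member names, the effect is roughly:

    /* ge_p1p1_to_p2: r->X = X*T, r->Z = Z*T, r->Y = Y*Z (order as in the asm). */
    static void ge_p1p1_to_p2_sketch(ge_p2* r, const ge_p1p1* p)
    {
        fe_mul(r->X, p->X, p->T);   /* x2 = p (X),        x1 = p + 0x60 (T) */
        fe_mul(r->Z, p->Z, p->T);   /* x2 -> p + 0x40 (Z), out at r + 0x40  */
        fe_mul(r->Y, p->Y, p->Z);   /* x1 -> p + 0x20 (Y), out at r + 0x20  */
    }

    /* ge_p1p1_to_p3 additionally keeps the extended coordinate T = X*Y. */
    static void ge_p1p1_to_p3_sketch(ge_p3* r, const ge_p1p1* p)
    {
        fe_mul(r->X, p->X, p->T);
        fe_mul(r->Y, p->Y, p->Z);
        fe_mul(r->Z, p->Z, p->T);
        fe_mul(r->T, p->X, p->Y);
    }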
"adc x9, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x24, x22, x13\n\t" - "umulh x25, x22, x13\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" - "adc x9, x9, xzr\n\t" - /* A[3] * B[1] */ - "mul x24, x23, x12\n\t" - "umulh x25, x23, x12\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" - "adc x9, x9, xzr\n\t" - /* A[2] * B[3] */ - "mul x24, x22, x14\n\t" - "umulh x25, x22, x14\n\t" - "adds x8, x8, x24\n\t" - "adcs x9, x9, x25\n\t" - "adc x10, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x24, x23, x13\n\t" - "umulh x25, x23, x13\n\t" - "adds x8, x8, x24\n\t" - "adcs x9, x9, x25\n\t" - "adc x10, x10, xzr\n\t" - /* A[3] * B[3] */ - "mul x24, x23, x14\n\t" - "umulh x25, x23, x14\n\t" - "adds x9, x9, x24\n\t" - "adc x10, x10, x25\n\t" + "ldp x10, x11, [x1]\n\t" + "ldp x12, x13, [x1, #16]\n\t" + /* A[0] * B[0] */ + "umulh x15, x10, x6\n\t" + "mul x14, x10, x6\n\t" + /* A[2] * B[0] */ + "umulh x17, x12, x6\n\t" + "mul x16, x12, x6\n\t" + /* A[1] * B[0] */ + "mul x3, x11, x6\n\t" + "adds x15, x15, x3\n\t" + "umulh x4, x11, x6\n\t" + "adcs x16, x16, x4\n\t" + "adc x17, x17, xzr\n\t" + /* A[1] * B[3] */ + "umulh x20, x11, x9\n\t" + "mul x19, x11, x9\n\t" + /* A[0] * B[1] */ + "mul x3, x10, x7\n\t" + "adds x15, x15, x3\n\t" + "umulh x4, x10, x7\n\t" + "adcs x16, x16, x4\n\t" + /* A[2] * B[1] */ + "mul x3, x12, x7\n\t" + "adcs x17, x17, x3\n\t" + "umulh x4, x12, x7\n\t" + "adcs x19, x19, x4\n\t" + "adc x20, x20, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x11, x8\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x11, x8\n\t" + "adcs x19, x19, x4\n\t" + "adcs x20, x20, xzr\n\t" + "adc x21, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x10, x8\n\t" + "adds x16, x16, x3\n\t" + "umulh x4, x10, x8\n\t" + "adcs x17, x17, x4\n\t" + "adcs x19, x19, xzr\n\t" + "adcs x20, x20, xzr\n\t" + "adc x21, x21, xzr\n\t" + /* A[1] * B[1] */ + "mul x3, x11, x7\n\t" + "adds x16, x16, x3\n\t" + "umulh x4, x11, x7\n\t" + "adcs x17, x17, x4\n\t" + /* A[3] * B[1] */ + "mul x3, x13, x7\n\t" + "adcs x19, x19, x3\n\t" + "umulh x4, x13, x7\n\t" + "adcs x20, x20, x4\n\t" + "adc x21, x21, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x12, x8\n\t" + "adds x19, x19, x3\n\t" + "umulh x4, x12, x8\n\t" + "adcs x20, x20, x4\n\t" + /* A[3] * B[3] */ + "mul x3, x13, x9\n\t" + "adcs x21, x21, x3\n\t" + "umulh x22, x13, x9\n\t" + "adc x22, x22, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x10, x9\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x10, x9\n\t" + "adcs x19, x19, x4\n\t" + /* A[2] * B[3] */ + "mul x3, x12, x9\n\t" + "adcs x20, x20, x3\n\t" + "umulh x4, x12, x9\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, x22, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x13, x6\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x13, x6\n\t" + "adcs x19, x19, x4\n\t" + /* A[3] * B[2] */ + "mul x3, x13, x8\n\t" + "adcs x20, x20, x3\n\t" + "umulh x4, x13, x8\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, x22, xzr\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "extr x7, x7, x6, #63\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x24, #19\n\t" - "mul x25, x24, x7\n\t" - "umulh x7, x24, x7\n\t" - "adds x3, x3, x25\n\t" - "mul x25, x24, x8\n\t" - "umulh x8, x24, x8\n\t" - "adcs x4, x4, x25\n\t" - "mul x25, x24, x9\n\t" - "umulh x9, x24, x9\n\t" - "adcs x5, x5, x25\n\t" - "mul x25, x24, x10\n\t" - "umulh x26, x24, x10\n\t" - "adcs x6, x6, x25\n\t" - "adc x26, x26, xzr\n\t" - /* Add remaining product results in */ - "adds x4, x4, x7\n\t" - 
"adcs x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adc x26, x26, xzr\n\t" - /* Overflow */ - "extr x26, x26, x6, #63\n\t" - "mul x26, x26, x24\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x26\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "adc x6, x6, xzr\n\t" - /* Reduce if top bit set */ - "and x26, x24, x6, asr 63\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x26\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "adc x6, x6, xzr\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x22\n\t" + "adds x17, x17, x4\n\t" + "umulh x5, x3, x22\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x17, #63\n\t" + "mul x5, x5, x3\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x19\n\t" + "adds x14, x14, x4\n\t" + "umulh x19, x3, x19\n\t" + "mul x4, x3, x20\n\t" + "adcs x15, x15, x4\n\t" + "umulh x20, x3, x20\n\t" + "mul x4, x3, x21\n\t" + "adcs x16, x16, x4\n\t" + "umulh x21, x3, x21\n\t" + "adc x17, x17, xzr\n\t" + /* Add high product results in */ + "adds x14, x14, x5\n\t" + "adcs x15, x15, x19\n\t" + "adcs x16, x16, x20\n\t" + "adc x17, x17, x21\n\t" /* Store */ - "stp x3, x4, [x0]\n\t" - "stp x5, x6, [x0, #16]\n\t" - "ldr x0, [x29, #24]\n\t" + "stp x14, x15, [x0]\n\t" + "stp x16, x17, [x0, #16]\n\t" + "ldp x29, x30, [sp], #32\n\t" + : [r] "+r" (r), [p] "+r" (p) + : + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22" + ); +} + +void ge_p1p1_to_p3(ge_p3* r, const ge_p1p1* p) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-32]!\n\t" + "add x29, sp, #0\n\t" + "str %x[r], [x29, #16]\n\t" + "str %x[p], [x29, #24]\n\t" + "mov x2, x1\n\t" + "add x1, x1, #0x60\n\t" /* Multiply */ - /* A[0] * B[0] */ - "mul x3, x11, x15\n\t" - "umulh x4, x11, x15\n\t" - /* A[0] * B[1] */ - "mul x24, x11, x16\n\t" - "umulh x5, x11, x16\n\t" - "adds x4, x4, x24\n\t" - "adc x5, x5, xzr\n\t" - /* A[1] * B[0] */ - "mul x24, x12, x15\n\t" - "umulh x25, x12, x15\n\t" - "adds x4, x4, x24\n\t" - "adcs x5, x5, x25\n\t" - "adc x6, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x24, x11, x17\n\t" - "umulh x25, x11, x17\n\t" - "adds x5, x5, x24\n\t" - "adc x6, x6, x25\n\t" - /* A[1] * B[1] */ - "mul x24, x12, x16\n\t" - "umulh x25, x12, x16\n\t" - "adds x5, x5, x24\n\t" - "adcs x6, x6, x25\n\t" - "adc x7, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x24, x13, x15\n\t" - "umulh x25, x13, x15\n\t" - "adds x5, x5, x24\n\t" - "adcs x6, x6, x25\n\t" - "adc x7, x7, xzr\n\t" - /* A[0] * B[3] */ - "mul x24, x11, x19\n\t" - "umulh x25, x11, x19\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" - "adc x8, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x24, x12, x17\n\t" - "umulh x25, x12, x17\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" - "adc x8, x8, xzr\n\t" - /* A[2] * B[1] */ - "mul x24, x13, x16\n\t" - "umulh x25, x13, x16\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" - "adc x8, x8, xzr\n\t" - /* A[3] * B[0] */ - "mul x24, x14, x15\n\t" - "umulh x25, x14, x15\n\t" - "adds x6, x6, x24\n\t" - "adcs x7, x7, x25\n\t" - "adc x8, x8, xzr\n\t" - /* A[1] * B[3] */ - "mul x24, x12, x19\n\t" - "umulh x25, x12, x19\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" - "adc x9, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x24, x13, x17\n\t" - "umulh x25, x13, x17\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" - "adc x9, x9, xzr\n\t" - /* A[3] * B[1] */ - "mul x24, x14, x16\n\t" - "umulh x25, x14, x16\n\t" - "adds x7, x7, x24\n\t" - "adcs x8, x8, x25\n\t" - "adc 
x9, x9, xzr\n\t" - /* A[2] * B[3] */ - "mul x24, x13, x19\n\t" - "umulh x25, x13, x19\n\t" - "adds x8, x8, x24\n\t" - "adcs x9, x9, x25\n\t" - "adc x10, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x24, x14, x17\n\t" - "umulh x25, x14, x17\n\t" - "adds x8, x8, x24\n\t" - "adcs x9, x9, x25\n\t" - "adc x10, x10, xzr\n\t" - /* A[3] * B[3] */ - "mul x24, x14, x19\n\t" - "umulh x25, x14, x19\n\t" - "adds x9, x9, x24\n\t" - "adc x10, x10, x25\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "extr x7, x7, x6, #63\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x24, #19\n\t" - "mul x25, x24, x7\n\t" - "umulh x7, x24, x7\n\t" - "adds x3, x3, x25\n\t" - "mul x25, x24, x8\n\t" - "umulh x8, x24, x8\n\t" - "adcs x4, x4, x25\n\t" - "mul x25, x24, x9\n\t" - "umulh x9, x24, x9\n\t" - "adcs x5, x5, x25\n\t" - "mul x25, x24, x10\n\t" - "umulh x26, x24, x10\n\t" - "adcs x6, x6, x25\n\t" - "adc x26, x26, xzr\n\t" - /* Add remaining product results in */ - "adds x4, x4, x7\n\t" - "adcs x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adc x26, x26, xzr\n\t" - /* Overflow */ - "extr x26, x26, x6, #63\n\t" - "mul x26, x26, x24\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x26\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "adc x6, x6, xzr\n\t" - /* Reduce if top bit set */ - "and x26, x24, x6, asr 63\n\t" - "and x6, x6, #0x7fffffffffffffff\n\t" - "adds x3, x3, x26\n\t" - "adcs x4, x4, xzr\n\t" - "adcs x5, x5, xzr\n\t" - "adc x6, x6, xzr\n\t" - /* Store */ - "stp x3, x4, [x0]\n\t" - "stp x5, x6, [x0, #16]\n\t" - "ldp x29, x30, [sp], #0x60\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt) - : - : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26" - ); -} - -void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz) -{ - __asm__ __volatile__ ( - "stp x29, x30, [sp, #-80]!\n\t" - "add x29, sp, #0\n\t" - "str %x[rx], [x29, #16]\n\t" - "str %x[ry], [x29, #24]\n\t" - "str %x[rz], [x29, #32]\n\t" - "str %x[rt], [x29, #40]\n\t" - "str %x[px], [x29, #48]\n\t" - "str %x[py], [x29, #56]\n\t" - "str %x[pz], [x29, #64]\n\t" - "ldr x1, [x29, #48]\n\t" - /* Square */ - "ldp x12, x13, [x1]\n\t" - "ldp x14, x15, [x1, #16]\n\t" - /* A[0] * A[1] */ - "mul x5, x12, x13\n\t" - "umulh x6, x12, x13\n\t" - /* A[0] * A[2] */ - "mul x25, x12, x14\n\t" - "umulh x7, x12, x14\n\t" - "adds x6, x6, x25\n\t" - "adc x7, x7, xzr\n\t" - /* A[0] * A[3] */ - "mul x25, x12, x15\n\t" - "umulh x8, x12, x15\n\t" - "adds x7, x7, x25\n\t" - "adc x8, x8, xzr\n\t" - /* A[1] * A[2] */ - "mul x25, x13, x14\n\t" - "umulh x26, x13, x14\n\t" - "adds x7, x7, x25\n\t" - "adcs x8, x8, x26\n\t" - "adc x9, xzr, xzr\n\t" - /* A[1] * A[3] */ - "mul x25, x13, x15\n\t" - "umulh x26, x13, x15\n\t" - "adds x8, x8, x25\n\t" - "adc x9, x9, x26\n\t" - /* A[2] * A[3] */ - "mul x25, x14, x15\n\t" - "umulh x10, x14, x15\n\t" - "adds x9, x9, x25\n\t" - "adc x10, x10, xzr\n\t" - /* Double */ - "adds x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adcs x7, x7, x7\n\t" - "adcs x8, x8, x8\n\t" - "adcs x9, x9, x9\n\t" - "adcs x10, x10, x10\n\t" - "adc x11, xzr, xzr\n\t" - /* A[0] * A[0] */ - "mul x4, x12, x12\n\t" - "umulh x27, x12, x12\n\t" - /* A[1] * A[1] */ - "mul x25, x13, x13\n\t" - "umulh x26, x13, x13\n\t" - "adds x5, x5, 
x27\n\t" - "adcs x6, x6, x25\n\t" - "adc x27, x26, xzr\n\t" - /* A[2] * A[2] */ - "mul x25, x14, x14\n\t" - "umulh x26, x14, x14\n\t" - "adds x7, x7, x27\n\t" - "adcs x8, x8, x25\n\t" - "adc x27, x26, xzr\n\t" - /* A[3] * A[3] */ - "mul x25, x15, x15\n\t" - "umulh x26, x15, x15\n\t" - "adds x9, x9, x27\n\t" - "adcs x10, x10, x25\n\t" - "adc x11, x11, x26\n\t" + "ldp x10, x11, [x1]\n\t" + "ldp x12, x13, [x1, #16]\n\t" + "ldp x6, x7, [x2]\n\t" + "ldp x8, x9, [x2, #16]\n\t" + /* A[0] * B[0] */ + "umulh x15, x10, x6\n\t" + "mul x14, x10, x6\n\t" + /* A[2] * B[0] */ + "umulh x17, x12, x6\n\t" + "mul x16, x12, x6\n\t" + /* A[1] * B[0] */ + "mul x3, x11, x6\n\t" + "adds x15, x15, x3\n\t" + "umulh x4, x11, x6\n\t" + "adcs x16, x16, x4\n\t" + "adc x17, x17, xzr\n\t" + /* A[1] * B[3] */ + "umulh x20, x11, x9\n\t" + "mul x19, x11, x9\n\t" + /* A[0] * B[1] */ + "mul x3, x10, x7\n\t" + "adds x15, x15, x3\n\t" + "umulh x4, x10, x7\n\t" + "adcs x16, x16, x4\n\t" + /* A[2] * B[1] */ + "mul x3, x12, x7\n\t" + "adcs x17, x17, x3\n\t" + "umulh x4, x12, x7\n\t" + "adcs x19, x19, x4\n\t" + "adc x20, x20, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x11, x8\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x11, x8\n\t" + "adcs x19, x19, x4\n\t" + "adcs x20, x20, xzr\n\t" + "adc x21, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x10, x8\n\t" + "adds x16, x16, x3\n\t" + "umulh x4, x10, x8\n\t" + "adcs x17, x17, x4\n\t" + "adcs x19, x19, xzr\n\t" + "adcs x20, x20, xzr\n\t" + "adc x21, x21, xzr\n\t" + /* A[1] * B[1] */ + "mul x3, x11, x7\n\t" + "adds x16, x16, x3\n\t" + "umulh x4, x11, x7\n\t" + "adcs x17, x17, x4\n\t" + /* A[3] * B[1] */ + "mul x3, x13, x7\n\t" + "adcs x19, x19, x3\n\t" + "umulh x4, x13, x7\n\t" + "adcs x20, x20, x4\n\t" + "adc x21, x21, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x12, x8\n\t" + "adds x19, x19, x3\n\t" + "umulh x4, x12, x8\n\t" + "adcs x20, x20, x4\n\t" + /* A[3] * B[3] */ + "mul x3, x13, x9\n\t" + "adcs x21, x21, x3\n\t" + "umulh x22, x13, x9\n\t" + "adc x22, x22, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x10, x9\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x10, x9\n\t" + "adcs x19, x19, x4\n\t" + /* A[2] * B[3] */ + "mul x3, x12, x9\n\t" + "adcs x20, x20, x3\n\t" + "umulh x4, x12, x9\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, x22, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x13, x6\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x13, x6\n\t" + "adcs x19, x19, x4\n\t" + /* A[3] * B[2] */ + "mul x3, x13, x8\n\t" + "adcs x20, x20, x3\n\t" + "umulh x4, x13, x8\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, x22, xzr\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x25, #19\n\t" - "mul x26, x25, x8\n\t" - "umulh x8, x25, x8\n\t" - "adds x4, x4, x26\n\t" - "mul x26, x25, x9\n\t" - "umulh x9, x25, x9\n\t" - "adcs x5, x5, x26\n\t" - "mul x26, x25, x10\n\t" - "umulh x10, x25, x10\n\t" - "adcs x6, x6, x26\n\t" - "mul x26, x25, x11\n\t" - "umulh x27, x25, x11\n\t" - "adcs x7, x7, x26\n\t" - "adc x27, x27, xzr\n\t" - /* Add remaining product results in */ - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adcs x7, x7, x10\n\t" - "adc x27, x27, xzr\n\t" - /* Overflow */ - "extr x27, x27, x7, #63\n\t" - "mul x27, x27, x25\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x27\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" - /* Reduce if top bit set */ - "and 
x27, x25, x7, asr 63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x27\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x22\n\t" + "adds x17, x17, x4\n\t" + "umulh x5, x3, x22\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x17, #63\n\t" + "mul x5, x5, x3\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x19\n\t" + "adds x14, x14, x4\n\t" + "umulh x19, x3, x19\n\t" + "mul x4, x3, x20\n\t" + "adcs x15, x15, x4\n\t" + "umulh x20, x3, x20\n\t" + "mul x4, x3, x21\n\t" + "adcs x16, x16, x4\n\t" + "umulh x21, x3, x21\n\t" + "adc x17, x17, xzr\n\t" + /* Add high product results in */ + "adds x14, x14, x5\n\t" + "adcs x15, x15, x19\n\t" + "adcs x16, x16, x20\n\t" + "adc x17, x17, x21\n\t" /* Store */ - "stp x4, x5, [x0]\n\t" - "stp x6, x7, [x0, #16]\n\t" - "ldr x0, [x29, #32]\n\t" - "ldr x1, [x29, #56]\n\t" - /* Square */ - "ldp x21, x22, [x1]\n\t" - "ldp x23, x24, [x1, #16]\n\t" - /* A[0] * A[1] */ - "mul x9, x21, x22\n\t" - "umulh x10, x21, x22\n\t" - /* A[0] * A[2] */ - "mul x25, x21, x23\n\t" - "umulh x11, x21, x23\n\t" - "adds x10, x10, x25\n\t" - "adc x11, x11, xzr\n\t" - /* A[0] * A[3] */ - "mul x25, x21, x24\n\t" - "umulh x16, x21, x24\n\t" - "adds x11, x11, x25\n\t" - "adc x16, x16, xzr\n\t" - /* A[1] * A[2] */ - "mul x25, x22, x23\n\t" - "umulh x26, x22, x23\n\t" - "adds x11, x11, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, xzr, xzr\n\t" - /* A[1] * A[3] */ - "mul x25, x22, x24\n\t" - "umulh x26, x22, x24\n\t" - "adds x16, x16, x25\n\t" - "adc x17, x17, x26\n\t" - /* A[2] * A[3] */ - "mul x25, x23, x24\n\t" - "umulh x19, x23, x24\n\t" - "adds x17, x17, x25\n\t" - "adc x19, x19, xzr\n\t" - /* Double */ - "adds x9, x9, x9\n\t" - "adcs x10, x10, x10\n\t" - "adcs x11, x11, x11\n\t" - "adcs x16, x16, x16\n\t" - "adcs x17, x17, x17\n\t" - "adcs x19, x19, x19\n\t" - "adc x20, xzr, xzr\n\t" - /* A[0] * A[0] */ - "mul x8, x21, x21\n\t" - "umulh x27, x21, x21\n\t" - /* A[1] * A[1] */ - "mul x25, x22, x22\n\t" - "umulh x26, x22, x22\n\t" - "adds x9, x9, x27\n\t" - "adcs x10, x10, x25\n\t" - "adc x27, x26, xzr\n\t" - /* A[2] * A[2] */ - "mul x25, x23, x23\n\t" - "umulh x26, x23, x23\n\t" - "adds x11, x11, x27\n\t" - "adcs x16, x16, x25\n\t" - "adc x27, x26, xzr\n\t" - /* A[3] * A[3] */ - "mul x25, x24, x24\n\t" - "umulh x26, x24, x24\n\t" - "adds x17, x17, x27\n\t" - "adcs x19, x19, x25\n\t" - "adc x20, x20, x26\n\t" + "stp x14, x15, [x0]\n\t" + "stp x16, x17, [x0, #16]\n\t" + "sub x1, x1, #0x40\n\t" + "add x0, x0, #0x60\n\t" + /* Multiply */ + "ldp x23, x24, [x1]\n\t" + "ldp x25, x26, [x1, #16]\n\t" + /* A[0] * B[0] */ + "umulh x15, x23, x6\n\t" + "mul x14, x23, x6\n\t" + /* A[2] * B[0] */ + "umulh x17, x25, x6\n\t" + "mul x16, x25, x6\n\t" + /* A[1] * B[0] */ + "mul x3, x24, x6\n\t" + "adds x15, x15, x3\n\t" + "umulh x4, x24, x6\n\t" + "adcs x16, x16, x4\n\t" + "adc x17, x17, xzr\n\t" + /* A[1] * B[3] */ + "umulh x20, x24, x9\n\t" + "mul x19, x24, x9\n\t" + /* A[0] * B[1] */ + "mul x3, x23, x7\n\t" + "adds x15, x15, x3\n\t" + "umulh x4, x23, x7\n\t" + "adcs x16, x16, x4\n\t" + /* A[2] * B[1] */ + "mul x3, x25, x7\n\t" + "adcs x17, x17, x3\n\t" + "umulh x4, x25, x7\n\t" + "adcs x19, x19, x4\n\t" + "adc x20, x20, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x24, x8\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x24, x8\n\t" + "adcs x19, x19, x4\n\t" + "adcs x20, x20, xzr\n\t" + "adc x21, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x23, x8\n\t" + "adds x16, x16, x3\n\t" + 
"umulh x4, x23, x8\n\t" + "adcs x17, x17, x4\n\t" + "adcs x19, x19, xzr\n\t" + "adcs x20, x20, xzr\n\t" + "adc x21, x21, xzr\n\t" + /* A[1] * B[1] */ + "mul x3, x24, x7\n\t" + "adds x16, x16, x3\n\t" + "umulh x4, x24, x7\n\t" + "adcs x17, x17, x4\n\t" + /* A[3] * B[1] */ + "mul x3, x26, x7\n\t" + "adcs x19, x19, x3\n\t" + "umulh x4, x26, x7\n\t" + "adcs x20, x20, x4\n\t" + "adc x21, x21, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x25, x8\n\t" + "adds x19, x19, x3\n\t" + "umulh x4, x25, x8\n\t" + "adcs x20, x20, x4\n\t" + /* A[3] * B[3] */ + "mul x3, x26, x9\n\t" + "adcs x21, x21, x3\n\t" + "umulh x22, x26, x9\n\t" + "adc x22, x22, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x23, x9\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x23, x9\n\t" + "adcs x19, x19, x4\n\t" + /* A[2] * B[3] */ + "mul x3, x25, x9\n\t" + "adcs x20, x20, x3\n\t" + "umulh x4, x25, x9\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, x22, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x26, x6\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x26, x6\n\t" + "adcs x19, x19, x4\n\t" + /* A[3] * B[2] */ + "mul x3, x26, x8\n\t" + "adcs x20, x20, x3\n\t" + "umulh x4, x26, x8\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, x22, xzr\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x20, x20, x19, #63\n\t" - "extr x19, x19, x17, #63\n\t" - "extr x17, x17, x16, #63\n\t" - "extr x16, x16, x11, #63\n\t" - "and x11, x11, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x25, #19\n\t" - "mul x26, x25, x16\n\t" - "umulh x16, x25, x16\n\t" - "adds x8, x8, x26\n\t" - "mul x26, x25, x17\n\t" - "umulh x17, x25, x17\n\t" - "adcs x9, x9, x26\n\t" - "mul x26, x25, x19\n\t" - "umulh x19, x25, x19\n\t" - "adcs x10, x10, x26\n\t" - "mul x26, x25, x20\n\t" - "umulh x27, x25, x20\n\t" - "adcs x11, x11, x26\n\t" - "adc x27, x27, xzr\n\t" - /* Add remaining product results in */ - "adds x9, x9, x16\n\t" - "adcs x10, x10, x17\n\t" - "adcs x11, x11, x19\n\t" - "adc x27, x27, xzr\n\t" - /* Overflow */ - "extr x27, x27, x11, #63\n\t" - "mul x27, x27, x25\n\t" - "and x11, x11, #0x7fffffffffffffff\n\t" - "adds x8, x8, x27\n\t" - "adcs x9, x9, xzr\n\t" - "adcs x10, x10, xzr\n\t" - "adc x11, x11, xzr\n\t" - /* Reduce if top bit set */ - "and x27, x25, x11, asr 63\n\t" - "and x11, x11, #0x7fffffffffffffff\n\t" - "adds x8, x8, x27\n\t" - "adcs x9, x9, xzr\n\t" - "adcs x10, x10, xzr\n\t" - "adc x11, x11, xzr\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x22\n\t" + "adds x17, x17, x4\n\t" + "umulh x5, x3, x22\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x17, #63\n\t" + "mul x5, x5, x3\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x19\n\t" + "adds x14, x14, x4\n\t" + "umulh x19, x3, x19\n\t" + "mul x4, x3, x20\n\t" + "adcs x15, x15, x4\n\t" + "umulh x20, x3, x20\n\t" + "mul x4, x3, x21\n\t" + "adcs x16, x16, x4\n\t" + "umulh x21, x3, x21\n\t" + "adc x17, x17, xzr\n\t" + /* Add high product results in */ + "adds x14, x14, x5\n\t" + "adcs x15, x15, x19\n\t" + "adcs x16, x16, x20\n\t" + "adc x17, x17, x21\n\t" /* Store */ - "stp x8, x9, [x0]\n\t" - "stp x10, x11, [x0, #16]\n\t" - "ldr x0, [x29, #24]\n\t" - /* Add */ - "adds x12, x12, x21\n\t" - "adcs x13, x13, x22\n\t" - "adcs x14, x14, x23\n\t" - "adc x15, x15, x24\n\t" - "mov x25, #-19\n\t" - "asr x28, x15, #63\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" - /* Sub modulus (if overflow) */ - "subs x12, x12, x25\n\t" - "sbcs x13, x13, x28\n\t" - "sbcs x14, x14, x28\n\t" - "sbc x15, x15, x26\n\t" - 
"ldr x0, [x29, #40]\n\t" - /* Square */ - /* A[0] * A[1] */ - "mul x17, x12, x13\n\t" - "umulh x19, x12, x13\n\t" - /* A[0] * A[2] */ - "mul x25, x12, x14\n\t" - "umulh x20, x12, x14\n\t" - "adds x19, x19, x25\n\t" + "stp x14, x15, [x0]\n\t" + "stp x16, x17, [x0, #16]\n\t" + "add x2, x1, #32\n\t" + "sub x0, x0, #0x40\n\t" + /* Multiply */ + "ldp x6, x7, [x2]\n\t" + "ldp x8, x9, [x2, #16]\n\t" + /* A[0] * B[0] */ + "umulh x15, x23, x6\n\t" + "mul x14, x23, x6\n\t" + /* A[2] * B[0] */ + "umulh x17, x25, x6\n\t" + "mul x16, x25, x6\n\t" + /* A[1] * B[0] */ + "mul x3, x24, x6\n\t" + "adds x15, x15, x3\n\t" + "umulh x4, x24, x6\n\t" + "adcs x16, x16, x4\n\t" + "adc x17, x17, xzr\n\t" + /* A[1] * B[3] */ + "umulh x20, x24, x9\n\t" + "mul x19, x24, x9\n\t" + /* A[0] * B[1] */ + "mul x3, x23, x7\n\t" + "adds x15, x15, x3\n\t" + "umulh x4, x23, x7\n\t" + "adcs x16, x16, x4\n\t" + /* A[2] * B[1] */ + "mul x3, x25, x7\n\t" + "adcs x17, x17, x3\n\t" + "umulh x4, x25, x7\n\t" + "adcs x19, x19, x4\n\t" "adc x20, x20, xzr\n\t" - /* A[0] * A[3] */ - "mul x25, x12, x15\n\t" - "umulh x21, x12, x15\n\t" - "adds x20, x20, x25\n\t" + /* A[1] * B[2] */ + "mul x3, x24, x8\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x24, x8\n\t" + "adcs x19, x19, x4\n\t" + "adcs x20, x20, xzr\n\t" + "adc x21, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x23, x8\n\t" + "adds x16, x16, x3\n\t" + "umulh x4, x23, x8\n\t" + "adcs x17, x17, x4\n\t" + "adcs x19, x19, xzr\n\t" + "adcs x20, x20, xzr\n\t" "adc x21, x21, xzr\n\t" - /* A[1] * A[2] */ - "mul x25, x13, x14\n\t" - "umulh x26, x13, x14\n\t" - "adds x20, x20, x25\n\t" - "adcs x21, x21, x26\n\t" - "adc x22, xzr, xzr\n\t" - /* A[1] * A[3] */ - "mul x25, x13, x15\n\t" - "umulh x26, x13, x15\n\t" - "adds x21, x21, x25\n\t" - "adc x22, x22, x26\n\t" - /* A[2] * A[3] */ - "mul x25, x14, x15\n\t" - "umulh x23, x14, x15\n\t" - "adds x22, x22, x25\n\t" - "adc x23, x23, xzr\n\t" - /* Double */ - "adds x17, x17, x17\n\t" - "adcs x19, x19, x19\n\t" - "adcs x20, x20, x20\n\t" - "adcs x21, x21, x21\n\t" - "adcs x22, x22, x22\n\t" - "adcs x23, x23, x23\n\t" - "adc x24, xzr, xzr\n\t" - /* A[0] * A[0] */ - "mul x16, x12, x12\n\t" - "umulh x27, x12, x12\n\t" - /* A[1] * A[1] */ - "mul x25, x13, x13\n\t" - "umulh x26, x13, x13\n\t" - "adds x17, x17, x27\n\t" - "adcs x19, x19, x25\n\t" - "adc x27, x26, xzr\n\t" - /* A[2] * A[2] */ - "mul x25, x14, x14\n\t" - "umulh x26, x14, x14\n\t" - "adds x20, x20, x27\n\t" - "adcs x21, x21, x25\n\t" - "adc x27, x26, xzr\n\t" - /* A[3] * A[3] */ - "mul x25, x15, x15\n\t" - "umulh x26, x15, x15\n\t" - "adds x22, x22, x27\n\t" - "adcs x23, x23, x25\n\t" - "adc x24, x24, x26\n\t" + /* A[1] * B[1] */ + "mul x3, x24, x7\n\t" + "adds x16, x16, x3\n\t" + "umulh x4, x24, x7\n\t" + "adcs x17, x17, x4\n\t" + /* A[3] * B[1] */ + "mul x3, x26, x7\n\t" + "adcs x19, x19, x3\n\t" + "umulh x4, x26, x7\n\t" + "adcs x20, x20, x4\n\t" + "adc x21, x21, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x25, x8\n\t" + "adds x19, x19, x3\n\t" + "umulh x4, x25, x8\n\t" + "adcs x20, x20, x4\n\t" + /* A[3] * B[3] */ + "mul x3, x26, x9\n\t" + "adcs x21, x21, x3\n\t" + "umulh x22, x26, x9\n\t" + "adc x22, x22, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x23, x9\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x23, x9\n\t" + "adcs x19, x19, x4\n\t" + /* A[2] * B[3] */ + "mul x3, x25, x9\n\t" + "adcs x20, x20, x3\n\t" + "umulh x4, x25, x9\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, x22, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x26, x6\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x26, x6\n\t" + "adcs x19, x19, 
x4\n\t" + /* A[3] * B[2] */ + "mul x3, x26, x8\n\t" + "adcs x20, x20, x3\n\t" + "umulh x4, x26, x8\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, x22, xzr\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x24, x24, x23, #63\n\t" - "extr x23, x23, x22, #63\n\t" - "extr x22, x22, x21, #63\n\t" - "extr x21, x21, x20, #63\n\t" - "and x20, x20, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x25, #19\n\t" - "mul x26, x25, x21\n\t" - "umulh x21, x25, x21\n\t" - "adds x16, x16, x26\n\t" - "mul x26, x25, x22\n\t" - "umulh x22, x25, x22\n\t" - "adcs x17, x17, x26\n\t" - "mul x26, x25, x23\n\t" - "umulh x23, x25, x23\n\t" - "adcs x19, x19, x26\n\t" - "mul x26, x25, x24\n\t" - "umulh x27, x25, x24\n\t" - "adcs x20, x20, x26\n\t" - "adc x27, x27, xzr\n\t" - /* Add remaining product results in */ - "adds x17, x17, x21\n\t" - "adcs x19, x19, x22\n\t" - "adcs x20, x20, x23\n\t" - "adc x27, x27, xzr\n\t" - /* Overflow */ - "extr x27, x27, x20, #63\n\t" - "mul x27, x27, x25\n\t" - "and x20, x20, #0x7fffffffffffffff\n\t" - "adds x16, x16, x27\n\t" - "adcs x17, x17, xzr\n\t" - "adcs x19, x19, xzr\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x22\n\t" + "adds x17, x17, x4\n\t" + "umulh x5, x3, x22\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x17, #63\n\t" + "mul x5, x5, x3\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x19\n\t" + "adds x14, x14, x4\n\t" + "umulh x19, x3, x19\n\t" + "mul x4, x3, x20\n\t" + "adcs x15, x15, x4\n\t" + "umulh x20, x3, x20\n\t" + "mul x4, x3, x21\n\t" + "adcs x16, x16, x4\n\t" + "umulh x21, x3, x21\n\t" + "adc x17, x17, xzr\n\t" + /* Add high product results in */ + "adds x14, x14, x5\n\t" + "adcs x15, x15, x19\n\t" + "adcs x16, x16, x20\n\t" + "adc x17, x17, x21\n\t" + /* Store */ + "stp x14, x15, [x0]\n\t" + "stp x16, x17, [x0, #16]\n\t" + "add x1, x1, #0x40\n\t" + "add x0, x0, #32\n\t" + /* Multiply */ + /* A[0] * B[0] */ + "umulh x15, x10, x6\n\t" + "mul x14, x10, x6\n\t" + /* A[2] * B[0] */ + "umulh x17, x12, x6\n\t" + "mul x16, x12, x6\n\t" + /* A[1] * B[0] */ + "mul x3, x11, x6\n\t" + "adds x15, x15, x3\n\t" + "umulh x4, x11, x6\n\t" + "adcs x16, x16, x4\n\t" + "adc x17, x17, xzr\n\t" + /* A[1] * B[3] */ + "umulh x20, x11, x9\n\t" + "mul x19, x11, x9\n\t" + /* A[0] * B[1] */ + "mul x3, x10, x7\n\t" + "adds x15, x15, x3\n\t" + "umulh x4, x10, x7\n\t" + "adcs x16, x16, x4\n\t" + /* A[2] * B[1] */ + "mul x3, x12, x7\n\t" + "adcs x17, x17, x3\n\t" + "umulh x4, x12, x7\n\t" + "adcs x19, x19, x4\n\t" "adc x20, x20, xzr\n\t" - /* Reduce if top bit set */ - "and x27, x25, x20, asr 63\n\t" - "and x20, x20, #0x7fffffffffffffff\n\t" - "adds x16, x16, x27\n\t" - "adcs x17, x17, xzr\n\t" + /* A[1] * B[2] */ + "mul x3, x11, x8\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x11, x8\n\t" + "adcs x19, x19, x4\n\t" + "adcs x20, x20, xzr\n\t" + "adc x21, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x3, x10, x8\n\t" + "adds x16, x16, x3\n\t" + "umulh x4, x10, x8\n\t" + "adcs x17, x17, x4\n\t" "adcs x19, x19, xzr\n\t" - "adc x20, x20, xzr\n\t" + "adcs x20, x20, xzr\n\t" + "adc x21, x21, xzr\n\t" + /* A[1] * B[1] */ + "mul x3, x11, x7\n\t" + "adds x16, x16, x3\n\t" + "umulh x4, x11, x7\n\t" + "adcs x17, x17, x4\n\t" + /* A[3] * B[1] */ + "mul x3, x13, x7\n\t" + "adcs x19, x19, x3\n\t" + "umulh x4, x13, x7\n\t" + "adcs x20, x20, x4\n\t" + "adc x21, x21, xzr\n\t" + /* A[2] * B[2] */ + "mul x3, x12, x8\n\t" + "adds x19, x19, x3\n\t" + "umulh x4, x12, x8\n\t" + "adcs x20, x20, x4\n\t" + /* A[3] * B[3] */ + "mul 
x3, x13, x9\n\t" + "adcs x21, x21, x3\n\t" + "umulh x22, x13, x9\n\t" + "adc x22, x22, xzr\n\t" + /* A[0] * B[3] */ + "mul x3, x10, x9\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x10, x9\n\t" + "adcs x19, x19, x4\n\t" + /* A[2] * B[3] */ + "mul x3, x12, x9\n\t" + "adcs x20, x20, x3\n\t" + "umulh x4, x12, x9\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, x22, xzr\n\t" + /* A[3] * B[0] */ + "mul x3, x13, x6\n\t" + "adds x17, x17, x3\n\t" + "umulh x4, x13, x6\n\t" + "adcs x19, x19, x4\n\t" + /* A[3] * B[2] */ + "mul x3, x13, x8\n\t" + "adcs x20, x20, x3\n\t" + "umulh x4, x13, x8\n\t" + "adcs x21, x21, x4\n\t" + "adc x22, x22, xzr\n\t" + /* Reduce */ + "mov x3, #38\n\t" + "mul x4, x3, x22\n\t" + "adds x17, x17, x4\n\t" + "umulh x5, x3, x22\n\t" + "adc x5, x5, xzr\n\t" + "mov x3, #19\n\t" + "extr x5, x5, x17, #63\n\t" + "mul x5, x5, x3\n\t" + "and x17, x17, #0x7fffffffffffffff\n\t" + "mov x3, #38\n\t" + "mul x4, x3, x19\n\t" + "adds x14, x14, x4\n\t" + "umulh x19, x3, x19\n\t" + "mul x4, x3, x20\n\t" + "adcs x15, x15, x4\n\t" + "umulh x20, x3, x20\n\t" + "mul x4, x3, x21\n\t" + "adcs x16, x16, x4\n\t" + "umulh x21, x3, x21\n\t" + "adc x17, x17, xzr\n\t" + /* Add high product results in */ + "adds x14, x14, x5\n\t" + "adcs x15, x15, x19\n\t" + "adcs x16, x16, x20\n\t" + "adc x17, x17, x21\n\t" /* Store */ - "stp x16, x17, [x0]\n\t" - "stp x19, x20, [x0, #16]\n\t" - "ldr x0, [x29, #24]\n\t" - "ldr x1, [x29, #32]\n\t" - /* Add */ - "adds x12, x8, x4\n\t" - "adcs x13, x9, x5\n\t" - "adcs x14, x10, x6\n\t" - "adc x15, x11, x7\n\t" - "mov x25, #-19\n\t" - "asr x28, x15, #63\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" - /* Sub modulus (if overflow) */ - "subs x12, x12, x25\n\t" - "sbcs x13, x13, x28\n\t" - "sbcs x14, x14, x28\n\t" - "sbc x15, x15, x26\n\t" - /* Sub */ - "subs x21, x8, x4\n\t" - "sbcs x22, x9, x5\n\t" - "sbcs x23, x10, x6\n\t" - "sbcs x24, x11, x7\n\t" - "mov x25, #-19\n\t" - "csetm x28, cc\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" - /* Add modulus (if underflow) */ - "adds x21, x21, x25\n\t" - "adcs x22, x22, x28\n\t" - "adcs x23, x23, x28\n\t" - "adc x24, x24, x26\n\t" - "stp x12, x13, [x0]\n\t" - "stp x14, x15, [x0, #16]\n\t" - "stp x21, x22, [x1]\n\t" - "stp x23, x24, [x1, #16]\n\t" - "ldr x0, [x29, #16]\n\t" - /* Sub */ - "subs x16, x16, x12\n\t" - "sbcs x17, x17, x13\n\t" - "sbcs x19, x19, x14\n\t" - "sbcs x20, x20, x15\n\t" - "mov x25, #-19\n\t" - "csetm x28, cc\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" - /* Add modulus (if underflow) */ - "adds x16, x16, x25\n\t" - "adcs x17, x17, x28\n\t" - "adcs x19, x19, x28\n\t" - "adc x20, x20, x26\n\t" - "stp x16, x17, [x0]\n\t" - "stp x19, x20, [x0, #16]\n\t" - "ldr x0, [x29, #40]\n\t" - "ldr x1, [x29, #64]\n\t" - /* Square * 2 */ - "ldp x12, x13, [x1]\n\t" - "ldp x14, x15, [x1, #16]\n\t" + "stp x14, x15, [x0]\n\t" + "stp x16, x17, [x0, #16]\n\t" + "ldp x29, x30, [sp], #32\n\t" + : [r] "+r" (r), [p] "+r" (p) + : + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26" + ); +} + +void ge_p2_dbl(ge_p1p1* r, const ge_p2* p) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-32]!\n\t" + "add x29, sp, #0\n\t" + "str %x[r], [x29, #16]\n\t" + "str %x[p], [x29, #24]\n\t" + "add x0, x0, #0x40\n\t" + /* Square */ + "ldp x4, x5, [x1]\n\t" + "ldp x6, x7, [x1, #16]\n\t" /* 
A[0] * A[1] */ - "mul x5, x12, x13\n\t" - "umulh x6, x12, x13\n\t" - /* A[0] * A[2] */ - "mul x25, x12, x14\n\t" - "umulh x7, x12, x14\n\t" - "adds x6, x6, x25\n\t" - "adc x7, x7, xzr\n\t" + "umulh x10, x4, x5\n\t" + "mul x9, x4, x5\n\t" /* A[0] * A[3] */ - "mul x25, x12, x15\n\t" - "umulh x8, x12, x15\n\t" - "adds x7, x7, x25\n\t" - "adc x8, x8, xzr\n\t" - /* A[1] * A[2] */ - "mul x25, x13, x14\n\t" - "umulh x26, x13, x14\n\t" - "adds x7, x7, x25\n\t" - "adcs x8, x8, x26\n\t" - "adc x9, xzr, xzr\n\t" + "umulh x12, x4, x7\n\t" + "mul x11, x4, x7\n\t" + /* A[0] * A[2] */ + "mul x25, x4, x6\n\t" + "adds x10, x10, x25\n\t" + "umulh x26, x4, x6\n\t" + "adcs x11, x11, x26\n\t" /* A[1] * A[3] */ - "mul x25, x13, x15\n\t" - "umulh x26, x13, x15\n\t" - "adds x8, x8, x25\n\t" - "adc x9, x9, x26\n\t" + "mul x25, x5, x7\n\t" + "adcs x12, x12, x25\n\t" + "umulh x13, x5, x7\n\t" + "adc x13, x13, xzr\n\t" + /* A[1] * A[2] */ + "mul x25, x5, x6\n\t" + "adds x11, x11, x25\n\t" + "umulh x26, x5, x6\n\t" + "adcs x12, x12, x26\n\t" /* A[2] * A[3] */ - "mul x25, x14, x15\n\t" - "umulh x10, x14, x15\n\t" - "adds x9, x9, x25\n\t" - "adc x10, x10, xzr\n\t" + "mul x25, x6, x7\n\t" + "adcs x13, x13, x25\n\t" + "umulh x14, x6, x7\n\t" + "adc x14, x14, xzr\n\t" /* Double */ - "adds x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adcs x7, x7, x7\n\t" - "adcs x8, x8, x8\n\t" - "adcs x9, x9, x9\n\t" + "adds x9, x9, x9\n\t" "adcs x10, x10, x10\n\t" - "adc x11, xzr, xzr\n\t" + "adcs x11, x11, x11\n\t" + "adcs x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adc x15, xzr, xzr\n\t" /* A[0] * A[0] */ - "mul x4, x12, x12\n\t" - "umulh x28, x12, x12\n\t" + "umulh x26, x4, x4\n\t" + "mul x8, x4, x4\n\t" /* A[1] * A[1] */ - "mul x25, x13, x13\n\t" - "umulh x26, x13, x13\n\t" - "adds x5, x5, x28\n\t" - "adcs x6, x6, x25\n\t" - "adc x28, x26, xzr\n\t" + "mul x25, x5, x5\n\t" + "adds x9, x9, x26\n\t" + "umulh x26, x5, x5\n\t" + "adcs x10, x10, x25\n\t" /* A[2] * A[2] */ - "mul x25, x14, x14\n\t" - "umulh x26, x14, x14\n\t" - "adds x7, x7, x28\n\t" - "adcs x8, x8, x25\n\t" - "adc x28, x26, xzr\n\t" + "mul x25, x6, x6\n\t" + "adcs x11, x11, x26\n\t" + "umulh x26, x6, x6\n\t" + "adcs x12, x12, x25\n\t" /* A[3] * A[3] */ - "mul x25, x15, x15\n\t" - "umulh x26, x15, x15\n\t" - "adds x9, x9, x28\n\t" - "adcs x10, x10, x25\n\t" - "adc x11, x11, x26\n\t" - /* Double and Reduce */ - "mov x25, #0x169\n\t" - /* Move top half into t4-t7 and remove top bit from t3 */ - "lsr x28, x11, #61\n\t" - "extr x11, x11, x10, #62\n\t" - "extr x10, x10, x9, #62\n\t" - "extr x9, x9, x8, #62\n\t" - "extr x8, x8, x7, #62\n\t" - "extr x7, x7, x6, #63\n\t" - "extr x6, x6, x5, #63\n\t" - "extr x5, x5, x4, #63\n\t" - "lsl x4, x4, #1\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - /* Two left, only one right */ - "and x11, x11, #0x7fffffffffffffff\n\t" - /* Multiply top bits by 19*19 */ - "mul x28, x28, x25\n\t" - /* Multiply top half by 19 */ - "mov x25, #19\n\t" - "mul x26, x25, x8\n\t" - "umulh x8, x25, x8\n\t" - "adds x4, x4, x26\n\t" - "mul x26, x25, x9\n\t" - "umulh x9, x25, x9\n\t" - "adcs x5, x5, x26\n\t" - "mul x26, x25, x10\n\t" - "umulh x10, x25, x10\n\t" - "adcs x6, x6, x26\n\t" - "mul x26, x25, x11\n\t" - "umulh x27, x25, x11\n\t" - "adcs x7, x7, x26\n\t" - "adc x27, x27, xzr\n\t" - /* Add remaining product results in */ - "adds x4, x4, x28\n\t" - "adcs x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adcs x7, x7, x10\n\t" - "adc x27, x27, xzr\n\t" - /* Overflow */ - "extr x27, x27, x7, #63\n\t" - "mul x27, x27, x25\n\t" - "and x7, x7, 
#0x7fffffffffffffff\n\t" - "adds x4, x4, x27\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" - /* Reduce if top bit set */ - "and x27, x25, x7, asr 63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x27\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" - /* Store */ - "ldr x0, [x29, #40]\n\t" - /* Sub */ - "subs x4, x4, x21\n\t" - "sbcs x5, x5, x22\n\t" - "sbcs x6, x6, x23\n\t" - "sbcs x7, x7, x24\n\t" - "mov x25, #-19\n\t" - "csetm x28, cc\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" - /* Add modulus (if underflow) */ - "adds x4, x4, x25\n\t" - "adcs x5, x5, x28\n\t" - "adcs x6, x6, x28\n\t" - "adc x7, x7, x26\n\t" - "stp x4, x5, [x0]\n\t" - "stp x6, x7, [x0, #16]\n\t" - "ldp x29, x30, [sp], #0x50\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz) - : - : "memory", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" - ); -} - -void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx) -{ - __asm__ __volatile__ ( - "stp x29, x30, [sp, #-112]!\n\t" - "add x29, sp, #0\n\t" - "str %x[qyminusx], [sp, #104]\n\t" - "str %x[qyplusx], [sp, #96]\n\t" - "str %x[qxy2d], [sp, #88]\n\t" - "str %x[rx], [x29, #16]\n\t" - "str %x[ry], [x29, #24]\n\t" - "str %x[rz], [x29, #32]\n\t" - "str %x[rt], [x29, #40]\n\t" - "str %x[px], [x29, #48]\n\t" - "str %x[py], [x29, #56]\n\t" - "str %x[pz], [x29, #64]\n\t" - "str %x[pt], [x29, #72]\n\t" - "ldr x2, [x29, #56]\n\t" - "ldr x3, [x29, #48]\n\t" - /* Add */ - "ldp x12, x13, [x2]\n\t" - "ldp x14, x15, [x2, #16]\n\t" - "ldp x16, x17, [x3]\n\t" - "ldp x19, x20, [x3, #16]\n\t" - "adds x4, x12, x16\n\t" - "adcs x5, x13, x17\n\t" - "adcs x6, x14, x19\n\t" - "adc x7, x15, x20\n\t" - "mov x25, #-19\n\t" - "asr x28, x7, #63\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" - /* Sub modulus (if overflow) */ - "subs x4, x4, x25\n\t" - "sbcs x5, x5, x28\n\t" - "sbcs x6, x6, x28\n\t" - "sbc x7, x7, x26\n\t" - /* Sub */ - "subs x8, x12, x16\n\t" - "sbcs x9, x13, x17\n\t" - "sbcs x10, x14, x19\n\t" - "sbcs x11, x15, x20\n\t" - "mov x25, #-19\n\t" - "csetm x28, cc\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" - /* Add modulus (if underflow) */ - "adds x8, x8, x25\n\t" - "adcs x9, x9, x28\n\t" - "adcs x10, x10, x28\n\t" - "adc x11, x11, x26\n\t" - "ldr x0, [x29, #32]\n\t" - "ldr x2, [sp, #96]\n\t" - /* Multiply */ - "ldp x21, x22, [x2]\n\t" - "ldp x23, x24, [x2, #16]\n\t" - /* A[0] * B[0] */ - "mul x12, x4, x21\n\t" - "umulh x13, x4, x21\n\t" - /* A[0] * B[1] */ - "mul x25, x4, x22\n\t" - "umulh x14, x4, x22\n\t" - "adds x13, x13, x25\n\t" - "adc x14, x14, xzr\n\t" - /* A[1] * B[0] */ - "mul x25, x5, x21\n\t" - "umulh x26, x5, x21\n\t" - "adds x13, x13, x25\n\t" - "adcs x14, x14, x26\n\t" - "adc x15, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x25, x4, x23\n\t" - "umulh x26, x4, x23\n\t" - "adds x14, x14, x25\n\t" + "mul x25, x7, x7\n\t" + "adcs x13, x13, x26\n\t" + "umulh x26, x7, x7\n\t" + "adcs x14, x14, x25\n\t" "adc x15, x15, x26\n\t" - /* A[1] * B[1] */ - "mul x25, x5, x22\n\t" - "umulh x26, x5, x22\n\t" - "adds x14, x14, x25\n\t" - "adcs x15, x15, x26\n\t" - "adc x16, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul 
x25, x6, x21\n\t" - "umulh x26, x6, x21\n\t" - "adds x14, x14, x25\n\t" - "adcs x15, x15, x26\n\t" - "adc x16, x16, xzr\n\t" - /* A[0] * B[3] */ - "mul x25, x4, x24\n\t" - "umulh x26, x4, x24\n\t" - "adds x15, x15, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x25, x5, x23\n\t" - "umulh x26, x5, x23\n\t" - "adds x15, x15, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, x17, xzr\n\t" - /* A[2] * B[1] */ - "mul x25, x6, x22\n\t" - "umulh x26, x6, x22\n\t" - "adds x15, x15, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, x17, xzr\n\t" - /* A[3] * B[0] */ - "mul x25, x7, x21\n\t" - "umulh x26, x7, x21\n\t" - "adds x15, x15, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, x17, xzr\n\t" - /* A[1] * B[3] */ - "mul x25, x5, x24\n\t" - "umulh x26, x5, x24\n\t" - "adds x16, x16, x25\n\t" - "adcs x17, x17, x26\n\t" - "adc x19, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x25, x6, x23\n\t" - "umulh x26, x6, x23\n\t" - "adds x16, x16, x25\n\t" - "adcs x17, x17, x26\n\t" - "adc x19, x19, xzr\n\t" - /* A[3] * B[1] */ - "mul x25, x7, x22\n\t" - "umulh x26, x7, x22\n\t" - "adds x16, x16, x25\n\t" - "adcs x17, x17, x26\n\t" - "adc x19, x19, xzr\n\t" - /* A[2] * B[3] */ - "mul x25, x6, x24\n\t" - "umulh x26, x6, x24\n\t" - "adds x17, x17, x25\n\t" - "adcs x19, x19, x26\n\t" - "adc x20, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x25, x7, x23\n\t" - "umulh x26, x7, x23\n\t" - "adds x17, x17, x25\n\t" - "adcs x19, x19, x26\n\t" - "adc x20, x20, xzr\n\t" - /* A[3] * B[3] */ - "mul x25, x7, x24\n\t" - "umulh x26, x7, x24\n\t" - "adds x19, x19, x25\n\t" - "adc x20, x20, x26\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x20, x20, x19, #63\n\t" - "extr x19, x19, x17, #63\n\t" - "extr x17, x17, x16, #63\n\t" - "extr x16, x16, x15, #63\n\t" - "and x15, x15, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x25, #19\n\t" - "mul x26, x25, x16\n\t" - "umulh x16, x25, x16\n\t" - "adds x12, x12, x26\n\t" - "mul x26, x25, x17\n\t" - "umulh x17, x25, x17\n\t" - "adcs x13, x13, x26\n\t" - "mul x26, x25, x19\n\t" - "umulh x19, x25, x19\n\t" - "adcs x14, x14, x26\n\t" - "mul x26, x25, x20\n\t" - "umulh x27, x25, x20\n\t" - "adcs x15, x15, x26\n\t" - "adc x27, x27, xzr\n\t" - /* Add remaining product results in */ - "adds x13, x13, x16\n\t" - "adcs x14, x14, x17\n\t" - "adcs x15, x15, x19\n\t" + "mov x25, #38\n\t" + "mul x26, x25, x15\n\t" + "adds x11, x11, x26\n\t" + "umulh x27, x25, x15\n\t" "adc x27, x27, xzr\n\t" - /* Overflow */ - "extr x27, x27, x15, #63\n\t" + "mov x25, #19\n\t" + "extr x27, x27, x11, #63\n\t" "mul x27, x27, x25\n\t" - "and x15, x15, #0x7fffffffffffffff\n\t" - "adds x12, x12, x27\n\t" - "adcs x13, x13, xzr\n\t" - "adcs x14, x14, xzr\n\t" - "adc x15, x15, xzr\n\t" - /* Reduce if top bit set */ - "and x27, x25, x15, asr 63\n\t" - "and x15, x15, #0x7fffffffffffffff\n\t" - "adds x12, x12, x27\n\t" - "adcs x13, x13, xzr\n\t" - "adcs x14, x14, xzr\n\t" - "adc x15, x15, xzr\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + "mov x25, #38\n\t" + "mul x26, x25, x12\n\t" + "adds x8, x8, x26\n\t" + "umulh x12, x25, x12\n\t" + "mul x26, x25, x13\n\t" + "adcs x9, x9, x26\n\t" + "umulh x13, x25, x13\n\t" + "mul x26, x25, x14\n\t" + "adcs x10, x10, x26\n\t" + "umulh x14, x25, x14\n\t" + "adc x11, x11, xzr\n\t" + /* Add high product results in */ + "adds x8, x8, x27\n\t" + "adcs x9, x9, x12\n\t" + "adcs x10, x10, x13\n\t" + "adc x11, x11, x14\n\t" /* Store */ - "ldr x0, [x29, #24]\n\t" - "ldr x1, [sp, #104]\n\t" - /* Multiply */ - 
"ldp x21, x22, [x1]\n\t" - "ldp x23, x24, [x1, #16]\n\t" - /* A[0] * B[0] */ - "mul x4, x8, x21\n\t" - "umulh x5, x8, x21\n\t" - /* A[0] * B[1] */ - "mul x25, x8, x22\n\t" - "umulh x6, x8, x22\n\t" - "adds x5, x5, x25\n\t" + "stp x8, x9, [x0]\n\t" + "stp x10, x11, [x0, #16]\n\t" + "add x2, x1, #32\n\t" + "sub x0, x0, #32\n\t" + /* Square */ + "ldp x16, x17, [x2]\n\t" + "ldp x19, x20, [x2, #16]\n\t" + /* A[0] * A[1] */ + "umulh x23, x16, x17\n\t" + "mul x22, x16, x17\n\t" + /* A[0] * A[3] */ + "umulh x4, x16, x20\n\t" + "mul x24, x16, x20\n\t" + /* A[0] * A[2] */ + "mul x25, x16, x19\n\t" + "adds x23, x23, x25\n\t" + "umulh x26, x16, x19\n\t" + "adcs x24, x24, x26\n\t" + /* A[1] * A[3] */ + "mul x25, x17, x20\n\t" + "adcs x4, x4, x25\n\t" + "umulh x5, x17, x20\n\t" + "adc x5, x5, xzr\n\t" + /* A[1] * A[2] */ + "mul x25, x17, x19\n\t" + "adds x24, x24, x25\n\t" + "umulh x26, x17, x19\n\t" + "adcs x4, x4, x26\n\t" + /* A[2] * A[3] */ + "mul x25, x19, x20\n\t" + "adcs x5, x5, x25\n\t" + "umulh x6, x19, x20\n\t" "adc x6, x6, xzr\n\t" - /* A[1] * B[0] */ - "mul x25, x9, x21\n\t" - "umulh x26, x9, x21\n\t" - "adds x5, x5, x25\n\t" - "adcs x6, x6, x26\n\t" + /* Double */ + "adds x22, x22, x22\n\t" + "adcs x23, x23, x23\n\t" + "adcs x24, x24, x24\n\t" + "adcs x4, x4, x4\n\t" + "adcs x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" "adc x7, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x25, x8, x23\n\t" - "umulh x26, x8, x23\n\t" - "adds x6, x6, x25\n\t" + /* A[0] * A[0] */ + "umulh x26, x16, x16\n\t" + "mul x21, x16, x16\n\t" + /* A[1] * A[1] */ + "mul x25, x17, x17\n\t" + "adds x22, x22, x26\n\t" + "umulh x26, x17, x17\n\t" + "adcs x23, x23, x25\n\t" + /* A[2] * A[2] */ + "mul x25, x19, x19\n\t" + "adcs x24, x24, x26\n\t" + "umulh x26, x19, x19\n\t" + "adcs x4, x4, x25\n\t" + /* A[3] * A[3] */ + "mul x25, x20, x20\n\t" + "adcs x5, x5, x26\n\t" + "umulh x26, x20, x20\n\t" + "adcs x6, x6, x25\n\t" "adc x7, x7, x26\n\t" - /* A[1] * B[1] */ - "mul x25, x9, x22\n\t" - "umulh x26, x9, x22\n\t" - "adds x6, x6, x25\n\t" - "adcs x7, x7, x26\n\t" - "adc x16, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x25, x10, x21\n\t" - "umulh x26, x10, x21\n\t" - "adds x6, x6, x25\n\t" - "adcs x7, x7, x26\n\t" - "adc x16, x16, xzr\n\t" - /* A[0] * B[3] */ - "mul x25, x8, x24\n\t" - "umulh x26, x8, x24\n\t" - "adds x7, x7, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x25, x9, x23\n\t" - "umulh x26, x9, x23\n\t" - "adds x7, x7, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, x17, xzr\n\t" - /* A[2] * B[1] */ - "mul x25, x10, x22\n\t" - "umulh x26, x10, x22\n\t" - "adds x7, x7, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, x17, xzr\n\t" - /* A[3] * B[0] */ - "mul x25, x11, x21\n\t" - "umulh x26, x11, x21\n\t" - "adds x7, x7, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, x17, xzr\n\t" - /* A[1] * B[3] */ - "mul x25, x9, x24\n\t" - "umulh x26, x9, x24\n\t" - "adds x16, x16, x25\n\t" - "adcs x17, x17, x26\n\t" - "adc x19, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x25, x10, x23\n\t" - "umulh x26, x10, x23\n\t" - "adds x16, x16, x25\n\t" - "adcs x17, x17, x26\n\t" - "adc x19, x19, xzr\n\t" - /* A[3] * B[1] */ - "mul x25, x11, x22\n\t" - "umulh x26, x11, x22\n\t" - "adds x16, x16, x25\n\t" - "adcs x17, x17, x26\n\t" - "adc x19, x19, xzr\n\t" - /* A[2] * B[3] */ - "mul x25, x10, x24\n\t" - "umulh x26, x10, x24\n\t" - "adds x17, x17, x25\n\t" - "adcs x19, x19, x26\n\t" - "adc x20, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x25, x11, x23\n\t" - "umulh x26, x11, x23\n\t" - "adds x17, x17, x25\n\t" - "adcs 
x19, x19, x26\n\t" - "adc x20, x20, xzr\n\t" - /* A[3] * B[3] */ - "mul x25, x11, x24\n\t" - "umulh x26, x11, x24\n\t" - "adds x19, x19, x25\n\t" - "adc x20, x20, x26\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x20, x20, x19, #63\n\t" - "extr x19, x19, x17, #63\n\t" - "extr x17, x17, x16, #63\n\t" - "extr x16, x16, x7, #63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x25, #19\n\t" - "mul x26, x25, x16\n\t" - "umulh x16, x25, x16\n\t" - "adds x4, x4, x26\n\t" - "mul x26, x25, x17\n\t" - "umulh x17, x25, x17\n\t" - "adcs x5, x5, x26\n\t" - "mul x26, x25, x19\n\t" - "umulh x19, x25, x19\n\t" - "adcs x6, x6, x26\n\t" - "mul x26, x25, x20\n\t" - "umulh x27, x25, x20\n\t" - "adcs x7, x7, x26\n\t" - "adc x27, x27, xzr\n\t" - /* Add remaining product results in */ - "adds x5, x5, x16\n\t" - "adcs x6, x6, x17\n\t" - "adcs x7, x7, x19\n\t" + "mov x25, #38\n\t" + "mul x26, x25, x7\n\t" + "adds x24, x24, x26\n\t" + "umulh x27, x25, x7\n\t" "adc x27, x27, xzr\n\t" - /* Overflow */ - "extr x27, x27, x7, #63\n\t" + "mov x25, #19\n\t" + "extr x27, x27, x24, #63\n\t" "mul x27, x27, x25\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x27\n\t" + "and x24, x24, #0x7fffffffffffffff\n\t" + "mov x25, #38\n\t" + "mul x26, x25, x4\n\t" + "adds x21, x21, x26\n\t" + "umulh x4, x25, x4\n\t" + "mul x26, x25, x5\n\t" + "adcs x22, x22, x26\n\t" + "umulh x5, x25, x5\n\t" + "mul x26, x25, x6\n\t" + "adcs x23, x23, x26\n\t" + "umulh x6, x25, x6\n\t" + "adc x24, x24, xzr\n\t" + /* Add high product results in */ + "adds x21, x21, x27\n\t" + "adcs x22, x22, x4\n\t" + "adcs x23, x23, x5\n\t" + "adc x24, x24, x6\n\t" + "add x3, x0, #32\n\t" + "mov x2, x0\n\t" + "add x1, x0, #32\n\t" + /* Add */ + "adds x4, x21, x8\n\t" + "adcs x5, x22, x9\n\t" + "adcs x6, x23, x10\n\t" + "adcs x7, x24, x11\n\t" + "cset x28, cs\n\t" + "mov x25, #19\n\t" + "extr x28, x28, x7, #63\n\t" + "mul x25, x28, x25\n\t" + /* Sub modulus (if overflow) */ + "adds x4, x4, x25\n\t" "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" - /* Reduce if top bit set */ - "and x27, x25, x7, asr 63\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x27\n\t" - "adcs x5, x5, xzr\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" - /* Store */ - "ldr x0, [x29, #24]\n\t" - "ldr x1, [x29, #16]\n\t" - /* Add */ - "adds x8, x12, x4\n\t" - "adcs x9, x13, x5\n\t" - "adcs x10, x14, x6\n\t" - "adc x11, x15, x7\n\t" + /* Sub */ + "subs x12, x21, x8\n\t" + "sbcs x13, x22, x9\n\t" + "sbcs x14, x23, x10\n\t" + "sbcs x15, x24, x11\n\t" + "csetm x28, cc\n\t" "mov x25, #-19\n\t" - "asr x28, x11, #63\n\t" + "extr x28, x28, x15, #63\n\t" + "mul x25, x28, x25\n\t" + /* Add modulus (if underflow) */ + "subs x12, x12, x25\n\t" + "sbcs x13, x13, xzr\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "sbcs x14, x14, xzr\n\t" + "sbc x15, x15, xzr\n\t" + "stp x4, x5, [x0]\n\t" + "stp x6, x7, [x0, #16]\n\t" + "stp x12, x13, [x1]\n\t" + "stp x14, x15, [x1, #16]\n\t" + "ldr x1, [x29, #24]\n\t" + "add x2, x1, #32\n\t" + "sub x0, x0, #32\n\t" + /* Add */ + "ldp x8, x9, [x1]\n\t" + "ldp x10, x11, [x1, #16]\n\t" + "adds x8, x8, x16\n\t" + "adcs x9, x9, x17\n\t" + "adcs x10, x10, x19\n\t" + "adcs x11, x11, x20\n\t" + "cset x28, cs\n\t" + "mov x25, #19\n\t" /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" + "extr x28, x28, x11, #63\n\t" + "mul x25, x28, x25\n\t" /* Sub modulus (if overflow) */ - "subs x8, x8, x25\n\t" - "sbcs x9, x9, x28\n\t" - 
"sbcs x10, x10, x28\n\t" - "sbc x11, x11, x26\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, xzr\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, x11, xzr\n\t" + "mov x1, x0\n\t" + /* Square */ + /* A[0] * A[1] */ + "umulh x23, x8, x9\n\t" + "mul x22, x8, x9\n\t" + /* A[0] * A[3] */ + "umulh x4, x8, x11\n\t" + "mul x24, x8, x11\n\t" + /* A[0] * A[2] */ + "mul x25, x8, x10\n\t" + "adds x23, x23, x25\n\t" + "umulh x26, x8, x10\n\t" + "adcs x24, x24, x26\n\t" + /* A[1] * A[3] */ + "mul x25, x9, x11\n\t" + "adcs x4, x4, x25\n\t" + "umulh x5, x9, x11\n\t" + "adc x5, x5, xzr\n\t" + /* A[1] * A[2] */ + "mul x25, x9, x10\n\t" + "adds x24, x24, x25\n\t" + "umulh x26, x9, x10\n\t" + "adcs x4, x4, x26\n\t" + /* A[2] * A[3] */ + "mul x25, x10, x11\n\t" + "adcs x5, x5, x25\n\t" + "umulh x6, x10, x11\n\t" + "adc x6, x6, xzr\n\t" + /* Double */ + "adds x22, x22, x22\n\t" + "adcs x23, x23, x23\n\t" + "adcs x24, x24, x24\n\t" + "adcs x4, x4, x4\n\t" + "adcs x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adc x7, xzr, xzr\n\t" + /* A[0] * A[0] */ + "umulh x26, x8, x8\n\t" + "mul x21, x8, x8\n\t" + /* A[1] * A[1] */ + "mul x25, x9, x9\n\t" + "adds x22, x22, x26\n\t" + "umulh x26, x9, x9\n\t" + "adcs x23, x23, x25\n\t" + /* A[2] * A[2] */ + "mul x25, x10, x10\n\t" + "adcs x24, x24, x26\n\t" + "umulh x26, x10, x10\n\t" + "adcs x4, x4, x25\n\t" + /* A[3] * A[3] */ + "mul x25, x11, x11\n\t" + "adcs x5, x5, x26\n\t" + "umulh x26, x11, x11\n\t" + "adcs x6, x6, x25\n\t" + "adc x7, x7, x26\n\t" + /* Reduce */ + "mov x25, #38\n\t" + "mul x26, x25, x7\n\t" + "adds x24, x24, x26\n\t" + "umulh x27, x25, x7\n\t" + "adc x27, x27, xzr\n\t" + "mov x25, #19\n\t" + "extr x27, x27, x24, #63\n\t" + "mul x27, x27, x25\n\t" + "and x24, x24, #0x7fffffffffffffff\n\t" + "mov x25, #38\n\t" + "mul x26, x25, x4\n\t" + "adds x21, x21, x26\n\t" + "umulh x4, x25, x4\n\t" + "mul x26, x25, x5\n\t" + "adcs x22, x22, x26\n\t" + "umulh x5, x25, x5\n\t" + "mul x26, x25, x6\n\t" + "adcs x23, x23, x26\n\t" + "umulh x6, x25, x6\n\t" + "adc x24, x24, xzr\n\t" + /* Add high product results in */ + "adds x21, x21, x27\n\t" + "adcs x22, x22, x4\n\t" + "adcs x23, x23, x5\n\t" + "adc x24, x24, x6\n\t" + "add x2, x0, #32\n\t" /* Sub */ - "subs x16, x12, x4\n\t" - "sbcs x17, x13, x5\n\t" - "sbcs x19, x14, x6\n\t" - "sbcs x20, x15, x7\n\t" - "mov x25, #-19\n\t" + "ldp x8, x9, [x2]\n\t" + "ldp x10, x11, [x2, #16]\n\t" + "subs x21, x21, x8\n\t" + "sbcs x22, x22, x9\n\t" + "sbcs x23, x23, x10\n\t" + "sbcs x24, x24, x11\n\t" "csetm x28, cc\n\t" + "mov x25, #-19\n\t" /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" + "extr x28, x28, x24, #63\n\t" + "mul x25, x28, x25\n\t" /* Add modulus (if underflow) */ - "adds x16, x16, x25\n\t" - "adcs x17, x17, x28\n\t" - "adcs x19, x19, x28\n\t" - "adc x20, x20, x26\n\t" - "stp x8, x9, [x0]\n\t" - "stp x10, x11, [x0, #16]\n\t" - "stp x16, x17, [x1]\n\t" - "stp x19, x20, [x1, #16]\n\t" - "ldr x0, [x29, #40]\n\t" - "ldr x1, [sp, #88]\n\t" - "ldr x3, [x29, #72]\n\t" - /* Multiply */ - "ldp x16, x17, [x1]\n\t" - "ldp x19, x20, [x1, #16]\n\t" - "ldp x21, x22, [x3]\n\t" - "ldp x23, x24, [x3, #16]\n\t" - /* A[0] * B[0] */ - "mul x4, x16, x21\n\t" - "umulh x5, x16, x21\n\t" - /* A[0] * B[1] */ - "mul x25, x16, x22\n\t" - "umulh x6, x16, x22\n\t" - "adds x5, x5, x25\n\t" - "adc x6, x6, xzr\n\t" - /* A[1] * B[0] */ - "mul x25, x17, x21\n\t" - "umulh x26, x17, x21\n\t" - "adds x5, x5, x25\n\t" - "adcs x6, x6, x26\n\t" - "adc x7, xzr, xzr\n\t" - /* A[0] * B[2] */ 
- "mul x25, x16, x23\n\t" - "umulh x26, x16, x23\n\t" - "adds x6, x6, x25\n\t" - "adc x7, x7, x26\n\t" - /* A[1] * B[1] */ - "mul x25, x17, x22\n\t" - "umulh x26, x17, x22\n\t" - "adds x6, x6, x25\n\t" - "adcs x7, x7, x26\n\t" - "adc x8, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x25, x19, x21\n\t" - "umulh x26, x19, x21\n\t" + "subs x21, x21, x25\n\t" + "sbcs x22, x22, xzr\n\t" + "and x24, x24, #0x7fffffffffffffff\n\t" + "sbcs x23, x23, xzr\n\t" + "sbc x24, x24, xzr\n\t" + "stp x21, x22, [x0]\n\t" + "stp x23, x24, [x0, #16]\n\t" + "ldr x2, [x29, #24]\n\t" + "add x2, x2, #0x40\n\t" + "add x0, x0, #0x60\n\t" + /* Square * 2 */ + "ldp x16, x17, [x2]\n\t" + "ldp x19, x20, [x2, #16]\n\t" + /* A[0] * A[1] */ + "umulh x6, x16, x17\n\t" + "mul x5, x16, x17\n\t" + /* A[0] * A[3] */ + "umulh x8, x16, x20\n\t" + "mul x7, x16, x20\n\t" + /* A[0] * A[2] */ + "mul x25, x16, x19\n\t" "adds x6, x6, x25\n\t" + "umulh x26, x16, x19\n\t" "adcs x7, x7, x26\n\t" - "adc x8, x8, xzr\n\t" - /* A[0] * B[3] */ - "mul x25, x16, x24\n\t" - "umulh x26, x16, x24\n\t" - "adds x7, x7, x25\n\t" - "adcs x8, x8, x26\n\t" - "adc x9, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x25, x17, x23\n\t" - "umulh x26, x17, x23\n\t" - "adds x7, x7, x25\n\t" - "adcs x8, x8, x26\n\t" - "adc x9, x9, xzr\n\t" - /* A[2] * B[1] */ - "mul x25, x19, x22\n\t" - "umulh x26, x19, x22\n\t" - "adds x7, x7, x25\n\t" - "adcs x8, x8, x26\n\t" + /* A[1] * A[3] */ + "mul x25, x17, x20\n\t" + "adcs x8, x8, x25\n\t" + "umulh x9, x17, x20\n\t" "adc x9, x9, xzr\n\t" - /* A[3] * B[0] */ - "mul x25, x20, x21\n\t" - "umulh x26, x20, x21\n\t" + /* A[1] * A[2] */ + "mul x25, x17, x19\n\t" "adds x7, x7, x25\n\t" + "umulh x26, x17, x19\n\t" "adcs x8, x8, x26\n\t" - "adc x9, x9, xzr\n\t" - /* A[1] * B[3] */ - "mul x25, x17, x24\n\t" - "umulh x26, x17, x24\n\t" - "adds x8, x8, x25\n\t" - "adcs x9, x9, x26\n\t" - "adc x10, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x25, x19, x23\n\t" - "umulh x26, x19, x23\n\t" - "adds x8, x8, x25\n\t" - "adcs x9, x9, x26\n\t" - "adc x10, x10, xzr\n\t" - /* A[3] * B[1] */ - "mul x25, x20, x22\n\t" - "umulh x26, x20, x22\n\t" - "adds x8, x8, x25\n\t" - "adcs x9, x9, x26\n\t" + /* A[2] * A[3] */ + "mul x25, x19, x20\n\t" + "adcs x9, x9, x25\n\t" + "umulh x10, x19, x20\n\t" "adc x10, x10, xzr\n\t" - /* A[2] * B[3] */ - "mul x25, x19, x24\n\t" - "umulh x26, x19, x24\n\t" - "adds x9, x9, x25\n\t" - "adcs x10, x10, x26\n\t" + /* Double */ + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adcs x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" "adc x11, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x25, x20, x23\n\t" - "umulh x26, x20, x23\n\t" - "adds x9, x9, x25\n\t" - "adcs x10, x10, x26\n\t" - "adc x11, x11, xzr\n\t" - /* A[3] * B[3] */ - "mul x25, x20, x24\n\t" - "umulh x26, x20, x24\n\t" - "adds x10, x10, x25\n\t" + /* A[0] * A[0] */ + "umulh x26, x16, x16\n\t" + "mul x4, x16, x16\n\t" + /* A[1] * A[1] */ + "mul x25, x17, x17\n\t" + "adds x5, x5, x26\n\t" + "umulh x26, x17, x17\n\t" + "adcs x6, x6, x25\n\t" + /* A[2] * A[2] */ + "mul x25, x19, x19\n\t" + "adcs x7, x7, x26\n\t" + "umulh x26, x19, x19\n\t" + "adcs x8, x8, x25\n\t" + /* A[3] * A[3] */ + "mul x25, x20, x20\n\t" + "adcs x9, x9, x26\n\t" + "umulh x26, x20, x20\n\t" + "adcs x10, x10, x25\n\t" "adc x11, x11, x26\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" + "mov x25, #38\n\t" + "mul x26, x25, 
x11\n\t" + "adds x7, x7, x26\n\t" + "umulh x27, x25, x11\n\t" + "adc x27, x27, xzr\n\t" + "mov x25, #19\n\t" + "extr x27, x27, x7, #63\n\t" + "mul x27, x27, x25\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x25, #19\n\t" + "mov x25, #38\n\t" "mul x26, x25, x8\n\t" - "umulh x8, x25, x8\n\t" "adds x4, x4, x26\n\t" + "umulh x8, x25, x8\n\t" "mul x26, x25, x9\n\t" - "umulh x9, x25, x9\n\t" "adcs x5, x5, x26\n\t" + "umulh x9, x25, x9\n\t" "mul x26, x25, x10\n\t" - "umulh x10, x25, x10\n\t" "adcs x6, x6, x26\n\t" - "mul x26, x25, x11\n\t" - "umulh x27, x25, x11\n\t" - "adcs x7, x7, x26\n\t" - "adc x27, x27, xzr\n\t" - /* Add remaining product results in */ - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adcs x7, x7, x10\n\t" - "adc x27, x27, xzr\n\t" - /* Overflow */ - "extr x27, x27, x7, #63\n\t" - "mul x27, x27, x25\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x27\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" + "umulh x10, x25, x10\n\t" "adc x7, x7, xzr\n\t" - /* Reduce if top bit set */ - "and x27, x25, x7, asr 63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" + /* Add high product results in */ "adds x4, x4, x27\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, x10\n\t" + "mov x25, #19\n\t" + "lsr x26, x7, #62\n\t" + "extr x7, x7, x6, #63\n\t" + "extr x6, x6, x5, #63\n\t" + "extr x5, x5, x4, #63\n\t" + "lsl x4, x4, #1\n\t" + "mul x26, x26, x25\n\t" + "adds x4, x4, x26\n\t" "adcs x5, x5, xzr\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" "adcs x6, x6, xzr\n\t" "adc x7, x7, xzr\n\t" /* Store */ - "ldr x0, [x29, #32]\n\t" - "ldr x1, [x29, #64]\n\t" - /* Double */ - "ldp x8, x9, [x1]\n\t" - "ldp x10, x11, [x1, #16]\n\t" - "adds x8, x8, x8\n\t" - "adcs x9, x9, x9\n\t" - "adcs x10, x10, x10\n\t" - "adc x11, x11, x11\n\t" - "mov x25, #-19\n\t" - "asr x28, x11, #63\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" - /* Sub modulus (if overflow) */ - "subs x8, x8, x25\n\t" - "sbcs x9, x9, x28\n\t" - "sbcs x10, x10, x28\n\t" - "sbc x11, x11, x26\n\t" - "ldr x1, [x29, #40]\n\t" - /* Add */ - "adds x12, x8, x4\n\t" - "adcs x13, x9, x5\n\t" - "adcs x14, x10, x6\n\t" - "adc x15, x11, x7\n\t" - "mov x25, #-19\n\t" - "asr x28, x15, #63\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" - /* Sub modulus (if overflow) */ - "subs x12, x12, x25\n\t" - "sbcs x13, x13, x28\n\t" - "sbcs x14, x14, x28\n\t" - "sbc x15, x15, x26\n\t" + "sub x1, x0, #32\n\t" /* Sub */ - "subs x16, x8, x4\n\t" - "sbcs x17, x9, x5\n\t" - "sbcs x19, x10, x6\n\t" - "sbcs x20, x11, x7\n\t" - "mov x25, #-19\n\t" + "subs x4, x4, x12\n\t" + "sbcs x5, x5, x13\n\t" + "sbcs x6, x6, x14\n\t" + "sbcs x7, x7, x15\n\t" "csetm x28, cc\n\t" + "mov x25, #-19\n\t" /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" + "extr x28, x28, x7, #63\n\t" + "mul x25, x28, x25\n\t" /* Add modulus (if underflow) */ - "adds x16, x16, x25\n\t" - "adcs x17, x17, x28\n\t" - "adcs x19, x19, x28\n\t" - "adc x20, x20, x26\n\t" - "stp x12, x13, [x0]\n\t" - "stp x14, x15, [x0, #16]\n\t" - "stp x16, x17, [x1]\n\t" - "stp x19, x20, [x1, #16]\n\t" - "ldp x29, x30, [sp], #0x70\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt), [qxy2d] "+r" (qxy2d), [qyplusx] "+r" (qyplusx), [qyminusx] "+r" (qyminusx) + "subs x4, x4, x25\n\t" + "sbcs x5, x5, xzr\n\t" + "and x7, x7, 
#0x7fffffffffffffff\n\t" + "sbcs x6, x6, xzr\n\t" + "sbc x7, x7, xzr\n\t" + "stp x4, x5, [x0]\n\t" + "stp x6, x7, [x0, #16]\n\t" + "ldp x29, x30, [sp], #32\n\t" + : [r] "+r" (r), [p] "+r" (p) : - : "memory", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } -void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx) +void ge_madd(ge_p1p1* r, const ge_p3* p, const ge_precomp* q) { __asm__ __volatile__ ( - "stp x29, x30, [sp, #-112]!\n\t" + "stp x29, x30, [sp, #-48]!\n\t" "add x29, sp, #0\n\t" - "str %x[qyminusx], [sp, #104]\n\t" - "str %x[qyplusx], [sp, #96]\n\t" - "str %x[qxy2d], [sp, #88]\n\t" - "str %x[rx], [x29, #16]\n\t" - "str %x[ry], [x29, #24]\n\t" - "str %x[rz], [x29, #32]\n\t" - "str %x[rt], [x29, #40]\n\t" - "str %x[px], [x29, #48]\n\t" - "str %x[py], [x29, #56]\n\t" - "str %x[pz], [x29, #64]\n\t" - "str %x[pt], [x29, #72]\n\t" - "ldr x2, [x29, #56]\n\t" - "ldr x3, [x29, #48]\n\t" + "str %x[r], [x29, #16]\n\t" + "str %x[p], [x29, #24]\n\t" + "str %x[q], [x29, #32]\n\t" + "mov x3, x1\n\t" + "add x2, x1, #32\n\t" + "add x1, x0, #32\n\t" /* Add */ - "ldp x12, x13, [x2]\n\t" - "ldp x14, x15, [x2, #16]\n\t" - "ldp x16, x17, [x3]\n\t" - "ldp x19, x20, [x3, #16]\n\t" - "adds x4, x12, x16\n\t" - "adcs x5, x13, x17\n\t" - "adcs x6, x14, x19\n\t" - "adc x7, x15, x20\n\t" - "mov x25, #-19\n\t" - "asr x28, x7, #63\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" + "ldp x8, x9, [x2]\n\t" + "ldp x10, x11, [x2, #16]\n\t" + "ldp x4, x5, [x3]\n\t" + "ldp x6, x7, [x3, #16]\n\t" + "adds x16, x8, x4\n\t" + "adcs x17, x9, x5\n\t" + "adcs x19, x10, x6\n\t" + "adcs x20, x11, x7\n\t" + "cset x28, cs\n\t" + "mov x25, #19\n\t" + "extr x28, x28, x20, #63\n\t" + "mul x25, x28, x25\n\t" /* Sub modulus (if overflow) */ - "subs x4, x4, x25\n\t" - "sbcs x5, x5, x28\n\t" - "sbcs x6, x6, x28\n\t" - "sbc x7, x7, x26\n\t" + "adds x16, x16, x25\n\t" + "adcs x17, x17, xzr\n\t" + "and x20, x20, #0x7fffffffffffffff\n\t" + "adcs x19, x19, xzr\n\t" + "adc x20, x20, xzr\n\t" /* Sub */ - "subs x8, x12, x16\n\t" - "sbcs x9, x13, x17\n\t" - "sbcs x10, x14, x19\n\t" - "sbcs x11, x15, x20\n\t" - "mov x25, #-19\n\t" + "subs x12, x8, x4\n\t" + "sbcs x13, x9, x5\n\t" + "sbcs x14, x10, x6\n\t" + "sbcs x15, x11, x7\n\t" "csetm x28, cc\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" + "mov x25, #-19\n\t" + "extr x28, x28, x15, #63\n\t" + "mul x25, x28, x25\n\t" /* Add modulus (if underflow) */ - "adds x8, x8, x25\n\t" - "adcs x9, x9, x28\n\t" - "adcs x10, x10, x28\n\t" - "adc x11, x11, x26\n\t" - "ldr x0, [x29, #32]\n\t" - "ldr x2, [sp, #104]\n\t" + "subs x12, x12, x25\n\t" + "sbcs x13, x13, xzr\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "sbcs x14, x14, xzr\n\t" + "sbc x15, x15, xzr\n\t" + "ldr x2, [x29, #32]\n\t" + "mov x1, x0\n\t" /* Multiply */ - "ldp x21, x22, [x2]\n\t" - "ldp x23, x24, [x2, #16]\n\t" - /* A[0] * B[0] */ - "mul x12, x4, x21\n\t" - "umulh x13, x4, x21\n\t" - /* A[0] * B[1] */ - "mul x25, x4, x22\n\t" - "umulh x14, x4, x22\n\t" - "adds x13, x13, x25\n\t" - "adc x14, x14, xzr\n\t" - /* A[1] * B[0] */ - "mul x25, x5, x21\n\t" - "umulh x26, x5, x21\n\t" - "adds x13, x13, 
x25\n\t" - "adcs x14, x14, x26\n\t" - "adc x15, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x25, x4, x23\n\t" - "umulh x26, x4, x23\n\t" - "adds x14, x14, x25\n\t" - "adc x15, x15, x26\n\t" - /* A[1] * B[1] */ - "mul x25, x5, x22\n\t" - "umulh x26, x5, x22\n\t" - "adds x14, x14, x25\n\t" - "adcs x15, x15, x26\n\t" - "adc x16, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x25, x6, x21\n\t" - "umulh x26, x6, x21\n\t" - "adds x14, x14, x25\n\t" - "adcs x15, x15, x26\n\t" - "adc x16, x16, xzr\n\t" - /* A[0] * B[3] */ - "mul x25, x4, x24\n\t" - "umulh x26, x4, x24\n\t" - "adds x15, x15, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x25, x5, x23\n\t" - "umulh x26, x5, x23\n\t" - "adds x15, x15, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, x17, xzr\n\t" - /* A[2] * B[1] */ - "mul x25, x6, x22\n\t" - "umulh x26, x6, x22\n\t" - "adds x15, x15, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, x17, xzr\n\t" - /* A[3] * B[0] */ - "mul x25, x7, x21\n\t" - "umulh x26, x7, x21\n\t" - "adds x15, x15, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, x17, xzr\n\t" - /* A[1] * B[3] */ - "mul x25, x5, x24\n\t" - "umulh x26, x5, x24\n\t" - "adds x16, x16, x25\n\t" - "adcs x17, x17, x26\n\t" - "adc x19, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x25, x6, x23\n\t" - "umulh x26, x6, x23\n\t" - "adds x16, x16, x25\n\t" - "adcs x17, x17, x26\n\t" - "adc x19, x19, xzr\n\t" - /* A[3] * B[1] */ - "mul x25, x7, x22\n\t" - "umulh x26, x7, x22\n\t" - "adds x16, x16, x25\n\t" - "adcs x17, x17, x26\n\t" - "adc x19, x19, xzr\n\t" - /* A[2] * B[3] */ - "mul x25, x6, x24\n\t" - "umulh x26, x6, x24\n\t" - "adds x17, x17, x25\n\t" - "adcs x19, x19, x26\n\t" - "adc x20, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x25, x7, x23\n\t" - "umulh x26, x7, x23\n\t" - "adds x17, x17, x25\n\t" - "adcs x19, x19, x26\n\t" - "adc x20, x20, xzr\n\t" - /* A[3] * B[3] */ - "mul x25, x7, x24\n\t" - "umulh x26, x7, x24\n\t" - "adds x19, x19, x25\n\t" - "adc x20, x20, x26\n\t" + "ldp x8, x9, [x2]\n\t" + "ldp x10, x11, [x2, #16]\n\t" + /* A[0] * B[0] */ + "umulh x22, x16, x8\n\t" + "mul x21, x16, x8\n\t" + /* A[2] * B[0] */ + "umulh x24, x19, x8\n\t" + "mul x23, x19, x8\n\t" + /* A[1] * B[0] */ + "mul x25, x17, x8\n\t" + "adds x22, x22, x25\n\t" + "umulh x26, x17, x8\n\t" + "adcs x23, x23, x26\n\t" + "adc x24, x24, xzr\n\t" + /* A[1] * B[3] */ + "umulh x5, x17, x11\n\t" + "mul x4, x17, x11\n\t" + /* A[0] * B[1] */ + "mul x25, x16, x9\n\t" + "adds x22, x22, x25\n\t" + "umulh x26, x16, x9\n\t" + "adcs x23, x23, x26\n\t" + /* A[2] * B[1] */ + "mul x25, x19, x9\n\t" + "adcs x24, x24, x25\n\t" + "umulh x26, x19, x9\n\t" + "adcs x4, x4, x26\n\t" + "adc x5, x5, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x17, x10\n\t" + "adds x24, x24, x25\n\t" + "umulh x26, x17, x10\n\t" + "adcs x4, x4, x26\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x16, x10\n\t" + "adds x23, x23, x25\n\t" + "umulh x26, x16, x10\n\t" + "adcs x24, x24, x26\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[1] */ + "mul x25, x17, x9\n\t" + "adds x23, x23, x25\n\t" + "umulh x26, x17, x9\n\t" + "adcs x24, x24, x26\n\t" + /* A[3] * B[1] */ + "mul x25, x20, x9\n\t" + "adcs x4, x4, x25\n\t" + "umulh x26, x20, x9\n\t" + "adcs x5, x5, x26\n\t" + "adc x6, x6, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x19, x10\n\t" + "adds x4, x4, x25\n\t" + "umulh x26, x19, x10\n\t" + "adcs x5, x5, x26\n\t" + /* A[3] * B[3] */ + "mul x25, x20, x11\n\t" + "adcs x6, x6, x25\n\t" + "umulh x7, x20, x11\n\t" 
+ "adc x7, x7, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x16, x11\n\t" + "adds x24, x24, x25\n\t" + "umulh x26, x16, x11\n\t" + "adcs x4, x4, x26\n\t" + /* A[2] * B[3] */ + "mul x25, x19, x11\n\t" + "adcs x5, x5, x25\n\t" + "umulh x26, x19, x11\n\t" + "adcs x6, x6, x26\n\t" + "adc x7, x7, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x20, x8\n\t" + "adds x24, x24, x25\n\t" + "umulh x26, x20, x8\n\t" + "adcs x4, x4, x26\n\t" + /* A[3] * B[2] */ + "mul x25, x20, x10\n\t" + "adcs x5, x5, x25\n\t" + "umulh x26, x20, x10\n\t" + "adcs x6, x6, x26\n\t" + "adc x7, x7, xzr\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x20, x20, x19, #63\n\t" - "extr x19, x19, x17, #63\n\t" - "extr x17, x17, x16, #63\n\t" - "extr x16, x16, x15, #63\n\t" - "and x15, x15, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x25, #19\n\t" - "mul x26, x25, x16\n\t" - "umulh x16, x25, x16\n\t" - "adds x12, x12, x26\n\t" - "mul x26, x25, x17\n\t" - "umulh x17, x25, x17\n\t" - "adcs x13, x13, x26\n\t" - "mul x26, x25, x19\n\t" - "umulh x19, x25, x19\n\t" - "adcs x14, x14, x26\n\t" - "mul x26, x25, x20\n\t" - "umulh x27, x25, x20\n\t" - "adcs x15, x15, x26\n\t" - "adc x27, x27, xzr\n\t" - /* Add remaining product results in */ - "adds x13, x13, x16\n\t" - "adcs x14, x14, x17\n\t" - "adcs x15, x15, x19\n\t" + "mov x25, #38\n\t" + "mul x26, x25, x7\n\t" + "adds x24, x24, x26\n\t" + "umulh x27, x25, x7\n\t" "adc x27, x27, xzr\n\t" - /* Overflow */ - "extr x27, x27, x15, #63\n\t" + "mov x25, #19\n\t" + "extr x27, x27, x24, #63\n\t" "mul x27, x27, x25\n\t" - "and x15, x15, #0x7fffffffffffffff\n\t" - "adds x12, x12, x27\n\t" - "adcs x13, x13, xzr\n\t" - "adcs x14, x14, xzr\n\t" - "adc x15, x15, xzr\n\t" - /* Reduce if top bit set */ - "and x27, x25, x15, asr 63\n\t" - "and x15, x15, #0x7fffffffffffffff\n\t" - "adds x12, x12, x27\n\t" - "adcs x13, x13, xzr\n\t" - "adcs x14, x14, xzr\n\t" - "adc x15, x15, xzr\n\t" - /* Store */ - "ldr x0, [x29, #24]\n\t" - "ldr x1, [sp, #96]\n\t" + "and x24, x24, #0x7fffffffffffffff\n\t" + "mov x25, #38\n\t" + "mul x26, x25, x4\n\t" + "adds x21, x21, x26\n\t" + "umulh x4, x25, x4\n\t" + "mul x26, x25, x5\n\t" + "adcs x22, x22, x26\n\t" + "umulh x5, x25, x5\n\t" + "mul x26, x25, x6\n\t" + "adcs x23, x23, x26\n\t" + "umulh x6, x25, x6\n\t" + "adc x24, x24, xzr\n\t" + /* Add high product results in */ + "adds x21, x21, x27\n\t" + "adcs x22, x22, x4\n\t" + "adcs x23, x23, x5\n\t" + "adc x24, x24, x6\n\t" + "add x2, x2, #32\n\t" + "add x1, x0, #32\n\t" + "add x0, x0, #32\n\t" /* Multiply */ - "ldp x21, x22, [x1]\n\t" - "ldp x23, x24, [x1, #16]\n\t" - /* A[0] * B[0] */ - "mul x4, x8, x21\n\t" - "umulh x5, x8, x21\n\t" - /* A[0] * B[1] */ - "mul x25, x8, x22\n\t" - "umulh x6, x8, x22\n\t" + "ldp x16, x17, [x2]\n\t" + "ldp x19, x20, [x2, #16]\n\t" + /* A[0] * B[0] */ + "umulh x5, x12, x16\n\t" + "mul x4, x12, x16\n\t" + /* A[2] * B[0] */ + "umulh x7, x14, x16\n\t" + "mul x6, x14, x16\n\t" + /* A[1] * B[0] */ + "mul x25, x13, x16\n\t" "adds x5, x5, x25\n\t" - "adc x6, x6, xzr\n\t" - /* A[1] * B[0] */ - "mul x25, x9, x21\n\t" - "umulh x26, x9, x21\n\t" + "umulh x26, x13, x16\n\t" + "adcs x6, x6, x26\n\t" + "adc x7, x7, xzr\n\t" + /* A[1] * B[3] */ + "umulh x9, x13, x20\n\t" + "mul x8, x13, x20\n\t" + /* A[0] * B[1] */ + "mul x25, x12, x17\n\t" "adds x5, x5, x25\n\t" + "umulh x26, x12, x17\n\t" "adcs x6, x6, x26\n\t" - "adc x7, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x25, x8, x23\n\t" - "umulh x26, x8, x23\n\t" - "adds x6, x6, x25\n\t" - "adc x7, x7, x26\n\t" - /* 
A[1] * B[1] */ - "mul x25, x9, x22\n\t" - "umulh x26, x9, x22\n\t" + /* A[2] * B[1] */ + "mul x25, x14, x17\n\t" + "adcs x7, x7, x25\n\t" + "umulh x26, x14, x17\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x13, x19\n\t" + "adds x7, x7, x25\n\t" + "umulh x26, x13, x19\n\t" + "adcs x8, x8, x26\n\t" + "adcs x9, x9, xzr\n\t" + "adc x10, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x12, x19\n\t" "adds x6, x6, x25\n\t" + "umulh x26, x12, x19\n\t" "adcs x7, x7, x26\n\t" - "adc x16, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x25, x10, x21\n\t" - "umulh x26, x10, x21\n\t" + "adcs x8, x8, xzr\n\t" + "adcs x9, x9, xzr\n\t" + "adc x10, x10, xzr\n\t" + /* A[1] * B[1] */ + "mul x25, x13, x17\n\t" "adds x6, x6, x25\n\t" + "umulh x26, x13, x17\n\t" "adcs x7, x7, x26\n\t" - "adc x16, x16, xzr\n\t" - /* A[0] * B[3] */ - "mul x25, x8, x24\n\t" - "umulh x26, x8, x24\n\t" - "adds x7, x7, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x25, x9, x23\n\t" - "umulh x26, x9, x23\n\t" - "adds x7, x7, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, x17, xzr\n\t" - /* A[2] * B[1] */ - "mul x25, x10, x22\n\t" - "umulh x26, x10, x22\n\t" + /* A[3] * B[1] */ + "mul x25, x15, x17\n\t" + "adcs x8, x8, x25\n\t" + "umulh x26, x15, x17\n\t" + "adcs x9, x9, x26\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x14, x19\n\t" + "adds x8, x8, x25\n\t" + "umulh x26, x14, x19\n\t" + "adcs x9, x9, x26\n\t" + /* A[3] * B[3] */ + "mul x25, x15, x20\n\t" + "adcs x10, x10, x25\n\t" + "umulh x11, x15, x20\n\t" + "adc x11, x11, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x12, x20\n\t" "adds x7, x7, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, x17, xzr\n\t" - /* A[3] * B[0] */ - "mul x25, x11, x21\n\t" - "umulh x26, x11, x21\n\t" + "umulh x26, x12, x20\n\t" + "adcs x8, x8, x26\n\t" + /* A[2] * B[3] */ + "mul x25, x14, x20\n\t" + "adcs x9, x9, x25\n\t" + "umulh x26, x14, x20\n\t" + "adcs x10, x10, x26\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x15, x16\n\t" "adds x7, x7, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, x17, xzr\n\t" - /* A[1] * B[3] */ - "mul x25, x9, x24\n\t" - "umulh x26, x9, x24\n\t" - "adds x16, x16, x25\n\t" - "adcs x17, x17, x26\n\t" - "adc x19, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x25, x10, x23\n\t" - "umulh x26, x10, x23\n\t" - "adds x16, x16, x25\n\t" - "adcs x17, x17, x26\n\t" - "adc x19, x19, xzr\n\t" - /* A[3] * B[1] */ - "mul x25, x11, x22\n\t" - "umulh x26, x11, x22\n\t" - "adds x16, x16, x25\n\t" - "adcs x17, x17, x26\n\t" - "adc x19, x19, xzr\n\t" - /* A[2] * B[3] */ - "mul x25, x10, x24\n\t" - "umulh x26, x10, x24\n\t" - "adds x17, x17, x25\n\t" - "adcs x19, x19, x26\n\t" - "adc x20, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x25, x11, x23\n\t" - "umulh x26, x11, x23\n\t" - "adds x17, x17, x25\n\t" - "adcs x19, x19, x26\n\t" - "adc x20, x20, xzr\n\t" - /* A[3] * B[3] */ - "mul x25, x11, x24\n\t" - "umulh x26, x11, x24\n\t" - "adds x19, x19, x25\n\t" - "adc x20, x20, x26\n\t" + "umulh x26, x15, x16\n\t" + "adcs x8, x8, x26\n\t" + /* A[3] * B[2] */ + "mul x25, x15, x19\n\t" + "adcs x9, x9, x25\n\t" + "umulh x26, x15, x19\n\t" + "adcs x10, x10, x26\n\t" + "adc x11, x11, xzr\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x20, x20, x19, #63\n\t" - "extr x19, x19, x17, #63\n\t" - "extr x17, x17, x16, #63\n\t" - "extr x16, x16, x7, #63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x25, #19\n\t" - "mul x26, x25, x16\n\t" - 
"umulh x16, x25, x16\n\t" - "adds x4, x4, x26\n\t" - "mul x26, x25, x17\n\t" - "umulh x17, x25, x17\n\t" - "adcs x5, x5, x26\n\t" - "mul x26, x25, x19\n\t" - "umulh x19, x25, x19\n\t" - "adcs x6, x6, x26\n\t" - "mul x26, x25, x20\n\t" - "umulh x27, x25, x20\n\t" - "adcs x7, x7, x26\n\t" - "adc x27, x27, xzr\n\t" - /* Add remaining product results in */ - "adds x5, x5, x16\n\t" - "adcs x6, x6, x17\n\t" - "adcs x7, x7, x19\n\t" + "mov x25, #38\n\t" + "mul x26, x25, x11\n\t" + "adds x7, x7, x26\n\t" + "umulh x27, x25, x11\n\t" "adc x27, x27, xzr\n\t" - /* Overflow */ + "mov x25, #19\n\t" "extr x27, x27, x7, #63\n\t" "mul x27, x27, x25\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x27\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" + "mov x25, #38\n\t" + "mul x26, x25, x8\n\t" + "adds x4, x4, x26\n\t" + "umulh x8, x25, x8\n\t" + "mul x26, x25, x9\n\t" + "adcs x5, x5, x26\n\t" + "umulh x9, x25, x9\n\t" + "mul x26, x25, x10\n\t" + "adcs x6, x6, x26\n\t" + "umulh x10, x25, x10\n\t" "adc x7, x7, xzr\n\t" - /* Reduce if top bit set */ - "and x27, x25, x7, asr 63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" + /* Add high product results in */ "adds x4, x4, x27\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" - /* Store */ - "ldr x0, [x29, #24]\n\t" - "ldr x1, [x29, #16]\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, x10\n\t" + "mov x3, x0\n\t" + "sub x2, x0, #32\n\t" + "sub x1, x0, #32\n\t" /* Add */ - "adds x8, x12, x4\n\t" - "adcs x9, x13, x5\n\t" - "adcs x10, x14, x6\n\t" - "adc x11, x15, x7\n\t" - "mov x25, #-19\n\t" - "asr x28, x11, #63\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" + "adds x8, x21, x4\n\t" + "adcs x9, x22, x5\n\t" + "adcs x10, x23, x6\n\t" + "adcs x11, x24, x7\n\t" + "cset x28, cs\n\t" + "mov x25, #19\n\t" + "extr x28, x28, x11, #63\n\t" + "mul x25, x28, x25\n\t" /* Sub modulus (if overflow) */ - "subs x8, x8, x25\n\t" - "sbcs x9, x9, x28\n\t" - "sbcs x10, x10, x28\n\t" - "sbc x11, x11, x26\n\t" + "adds x8, x8, x25\n\t" + "adcs x9, x9, xzr\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, x11, xzr\n\t" /* Sub */ - "subs x16, x12, x4\n\t" - "sbcs x17, x13, x5\n\t" - "sbcs x19, x14, x6\n\t" - "sbcs x20, x15, x7\n\t" - "mov x25, #-19\n\t" + "subs x12, x21, x4\n\t" + "sbcs x13, x22, x5\n\t" + "sbcs x14, x23, x6\n\t" + "sbcs x15, x24, x7\n\t" "csetm x28, cc\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" + "mov x25, #-19\n\t" + "extr x28, x28, x15, #63\n\t" + "mul x25, x28, x25\n\t" /* Add modulus (if underflow) */ - "adds x16, x16, x25\n\t" - "adcs x17, x17, x28\n\t" - "adcs x19, x19, x28\n\t" - "adc x20, x20, x26\n\t" + "subs x12, x12, x25\n\t" + "sbcs x13, x13, xzr\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "sbcs x14, x14, xzr\n\t" + "sbc x15, x15, xzr\n\t" "stp x8, x9, [x0]\n\t" "stp x10, x11, [x0, #16]\n\t" - "stp x16, x17, [x1]\n\t" - "stp x19, x20, [x1, #16]\n\t" - "ldr x0, [x29, #40]\n\t" - "ldr x1, [sp, #88]\n\t" - "ldr x3, [x29, #72]\n\t" + "stp x12, x13, [x1]\n\t" + "stp x14, x15, [x1, #16]\n\t" + "ldr x1, [x29, #24]\n\t" + "ldr x2, [x29, #32]\n\t" + "add x2, x2, #0x40\n\t" + "add x1, x1, #0x60\n\t" + "add x0, x0, #0x40\n\t" /* Multiply */ - "ldp x16, x17, [x1]\n\t" - "ldp x19, x20, [x1, #16]\n\t" - "ldp x21, x22, [x3]\n\t" - "ldp x23, x24, [x3, #16]\n\t" - /* A[0] * B[0] */ - "mul x4, x16, x21\n\t" - "umulh x5, x16, x21\n\t" - /* A[0] * B[1] */ - "mul x25, 
x16, x22\n\t" - "umulh x6, x16, x22\n\t" - "adds x5, x5, x25\n\t" - "adc x6, x6, xzr\n\t" - /* A[1] * B[0] */ - "mul x25, x17, x21\n\t" - "umulh x26, x17, x21\n\t" - "adds x5, x5, x25\n\t" - "adcs x6, x6, x26\n\t" - "adc x7, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x25, x16, x23\n\t" - "umulh x26, x16, x23\n\t" - "adds x6, x6, x25\n\t" - "adc x7, x7, x26\n\t" - /* A[1] * B[1] */ - "mul x25, x17, x22\n\t" - "umulh x26, x17, x22\n\t" - "adds x6, x6, x25\n\t" - "adcs x7, x7, x26\n\t" - "adc x8, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x25, x19, x21\n\t" - "umulh x26, x19, x21\n\t" - "adds x6, x6, x25\n\t" - "adcs x7, x7, x26\n\t" - "adc x8, x8, xzr\n\t" - /* A[0] * B[3] */ - "mul x25, x16, x24\n\t" - "umulh x26, x16, x24\n\t" - "adds x7, x7, x25\n\t" - "adcs x8, x8, x26\n\t" - "adc x9, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x25, x17, x23\n\t" - "umulh x26, x17, x23\n\t" - "adds x7, x7, x25\n\t" - "adcs x8, x8, x26\n\t" - "adc x9, x9, xzr\n\t" - /* A[2] * B[1] */ - "mul x25, x19, x22\n\t" - "umulh x26, x19, x22\n\t" - "adds x7, x7, x25\n\t" + "ldp x21, x22, [x1]\n\t" + "ldp x23, x24, [x1, #16]\n\t" + "ldp x4, x5, [x2]\n\t" + "ldp x6, x7, [x2, #16]\n\t" + /* A[0] * B[0] */ + "umulh x17, x21, x4\n\t" + "mul x16, x21, x4\n\t" + /* A[2] * B[0] */ + "umulh x20, x23, x4\n\t" + "mul x19, x23, x4\n\t" + /* A[1] * B[0] */ + "mul x25, x22, x4\n\t" + "adds x17, x17, x25\n\t" + "umulh x26, x22, x4\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, x20, xzr\n\t" + /* A[1] * B[3] */ + "umulh x9, x22, x7\n\t" + "mul x8, x22, x7\n\t" + /* A[0] * B[1] */ + "mul x25, x21, x5\n\t" + "adds x17, x17, x25\n\t" + "umulh x26, x21, x5\n\t" + "adcs x19, x19, x26\n\t" + /* A[2] * B[1] */ + "mul x25, x23, x5\n\t" + "adcs x20, x20, x25\n\t" + "umulh x26, x23, x5\n\t" "adcs x8, x8, x26\n\t" "adc x9, x9, xzr\n\t" - /* A[3] * B[0] */ - "mul x25, x20, x21\n\t" - "umulh x26, x20, x21\n\t" - "adds x7, x7, x25\n\t" + /* A[1] * B[2] */ + "mul x25, x22, x6\n\t" + "adds x20, x20, x25\n\t" + "umulh x26, x22, x6\n\t" "adcs x8, x8, x26\n\t" - "adc x9, x9, xzr\n\t" - /* A[1] * B[3] */ - "mul x25, x17, x24\n\t" - "umulh x26, x17, x24\n\t" - "adds x8, x8, x25\n\t" - "adcs x9, x9, x26\n\t" + "adcs x9, x9, xzr\n\t" "adc x10, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x25, x19, x23\n\t" - "umulh x26, x19, x23\n\t" - "adds x8, x8, x25\n\t" + /* A[0] * B[2] */ + "mul x25, x21, x6\n\t" + "adds x19, x19, x25\n\t" + "umulh x26, x21, x6\n\t" + "adcs x20, x20, x26\n\t" + "adcs x8, x8, xzr\n\t" + "adcs x9, x9, xzr\n\t" + "adc x10, x10, xzr\n\t" + /* A[1] * B[1] */ + "mul x25, x22, x5\n\t" + "adds x19, x19, x25\n\t" + "umulh x26, x22, x5\n\t" + "adcs x20, x20, x26\n\t" + /* A[3] * B[1] */ + "mul x25, x24, x5\n\t" + "adcs x8, x8, x25\n\t" + "umulh x26, x24, x5\n\t" "adcs x9, x9, x26\n\t" "adc x10, x10, xzr\n\t" - /* A[3] * B[1] */ - "mul x25, x20, x22\n\t" - "umulh x26, x20, x22\n\t" + /* A[2] * B[2] */ + "mul x25, x23, x6\n\t" "adds x8, x8, x25\n\t" + "umulh x26, x23, x6\n\t" "adcs x9, x9, x26\n\t" - "adc x10, x10, xzr\n\t" - /* A[2] * B[3] */ - "mul x25, x19, x24\n\t" - "umulh x26, x19, x24\n\t" - "adds x9, x9, x25\n\t" + /* A[3] * B[3] */ + "mul x25, x24, x7\n\t" + "adcs x10, x10, x25\n\t" + "umulh x11, x24, x7\n\t" + "adc x11, x11, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x21, x7\n\t" + "adds x20, x20, x25\n\t" + "umulh x26, x21, x7\n\t" + "adcs x8, x8, x26\n\t" + /* A[2] * B[3] */ + "mul x25, x23, x7\n\t" + "adcs x9, x9, x25\n\t" + "umulh x26, x23, x7\n\t" "adcs x10, x10, x26\n\t" - "adc x11, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x25, x20, x23\n\t" 
- "umulh x26, x20, x23\n\t" - "adds x9, x9, x25\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x24, x4\n\t" + "adds x20, x20, x25\n\t" + "umulh x26, x24, x4\n\t" + "adcs x8, x8, x26\n\t" + /* A[3] * B[2] */ + "mul x25, x24, x6\n\t" + "adcs x9, x9, x25\n\t" + "umulh x26, x24, x6\n\t" "adcs x10, x10, x26\n\t" "adc x11, x11, xzr\n\t" - /* A[3] * B[3] */ - "mul x25, x20, x24\n\t" - "umulh x26, x20, x24\n\t" - "adds x10, x10, x25\n\t" - "adc x11, x11, x26\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ + "mov x25, #38\n\t" + "mul x26, x25, x11\n\t" + "adds x20, x20, x26\n\t" + "umulh x27, x25, x11\n\t" + "adc x27, x27, xzr\n\t" "mov x25, #19\n\t" + "extr x27, x27, x20, #63\n\t" + "mul x27, x27, x25\n\t" + "and x20, x20, #0x7fffffffffffffff\n\t" + "mov x25, #38\n\t" "mul x26, x25, x8\n\t" + "adds x16, x16, x26\n\t" "umulh x8, x25, x8\n\t" - "adds x4, x4, x26\n\t" "mul x26, x25, x9\n\t" + "adcs x17, x17, x26\n\t" "umulh x9, x25, x9\n\t" - "adcs x5, x5, x26\n\t" "mul x26, x25, x10\n\t" + "adcs x19, x19, x26\n\t" "umulh x10, x25, x10\n\t" - "adcs x6, x6, x26\n\t" - "mul x26, x25, x11\n\t" - "umulh x27, x25, x11\n\t" - "adcs x7, x7, x26\n\t" - "adc x27, x27, xzr\n\t" - /* Add remaining product results in */ - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adcs x7, x7, x10\n\t" - "adc x27, x27, xzr\n\t" - /* Overflow */ - "extr x27, x27, x7, #63\n\t" - "mul x27, x27, x25\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x27\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" - /* Reduce if top bit set */ - "and x27, x25, x7, asr 63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x27\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" - /* Store */ - "ldr x0, [x29, #32]\n\t" - "ldr x1, [x29, #64]\n\t" + "adc x20, x20, xzr\n\t" + /* Add high product results in */ + "adds x16, x16, x27\n\t" + "adcs x17, x17, x8\n\t" + "adcs x19, x19, x9\n\t" + "adc x20, x20, x10\n\t" + "sub x1, x1, #32\n\t" /* Double */ - "ldp x8, x9, [x1]\n\t" - "ldp x10, x11, [x1, #16]\n\t" - "adds x8, x8, x8\n\t" - "adcs x9, x9, x9\n\t" - "adcs x10, x10, x10\n\t" - "adc x11, x11, x11\n\t" - "mov x25, #-19\n\t" - "asr x28, x11, #63\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" - /* Sub modulus (if overflow) */ - "subs x8, x8, x25\n\t" - "sbcs x9, x9, x28\n\t" - "sbcs x10, x10, x28\n\t" - "sbc x11, x11, x26\n\t" - "ldr x1, [x29, #40]\n\t" - /* Add */ - "adds x12, x8, x4\n\t" - "adcs x13, x9, x5\n\t" - "adcs x14, x10, x6\n\t" - "adc x15, x11, x7\n\t" + "ldp x12, x13, [x1]\n\t" + "ldp x14, x15, [x1, #16]\n\t" + "adds x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adc x15, x15, x15\n\t" "mov x25, #-19\n\t" "asr x28, x15, #63\n\t" /* Mask the modulus */ @@ -5790,1074 +6262,1112 @@ void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe p "sbcs x13, x13, x28\n\t" "sbcs x14, x14, x28\n\t" "sbc x15, x15, x26\n\t" + "mov x3, x0\n\t" + "sub x2, x0, #32\n\t" + "mov x1, x0\n\t" + "sub x0, x0, #32\n\t" + /* Add */ + "adds x8, x12, x16\n\t" + "adcs x9, x13, x17\n\t" + "adcs x10, x14, x19\n\t" + "adcs x11, x15, x20\n\t" + "cset x28, cs\n\t" + "mov x25, #19\n\t" + "extr x28, x28, x11, #63\n\t" + "mul x25, x28, x25\n\t" + /* Sub modulus 
(if overflow) */ + "adds x8, x8, x25\n\t" + "adcs x9, x9, xzr\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, x11, xzr\n\t" /* Sub */ - "subs x16, x8, x4\n\t" - "sbcs x17, x9, x5\n\t" - "sbcs x19, x10, x6\n\t" - "sbcs x20, x11, x7\n\t" - "mov x25, #-19\n\t" + "subs x4, x12, x16\n\t" + "sbcs x5, x13, x17\n\t" + "sbcs x6, x14, x19\n\t" + "sbcs x7, x15, x20\n\t" "csetm x28, cc\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" + "mov x25, #-19\n\t" + "extr x28, x28, x7, #63\n\t" + "mul x25, x28, x25\n\t" /* Add modulus (if underflow) */ - "adds x16, x16, x25\n\t" - "adcs x17, x17, x28\n\t" - "adcs x19, x19, x28\n\t" - "adc x20, x20, x26\n\t" - "stp x12, x13, [x1]\n\t" - "stp x14, x15, [x1, #16]\n\t" - "stp x16, x17, [x0]\n\t" - "stp x19, x20, [x0, #16]\n\t" - "ldp x29, x30, [sp], #0x70\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt), [qxy2d] "+r" (qxy2d), [qyplusx] "+r" (qyplusx), [qyminusx] "+r" (qyminusx) + "subs x4, x4, x25\n\t" + "sbcs x5, x5, xzr\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "sbcs x6, x6, xzr\n\t" + "sbc x7, x7, xzr\n\t" + "stp x8, x9, [x0]\n\t" + "stp x10, x11, [x0, #16]\n\t" + "stp x4, x5, [x1]\n\t" + "stp x6, x7, [x1, #16]\n\t" + "ldp x29, x30, [sp], #48\n\t" + : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q) : - : "memory", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } -void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx) +void ge_msub(ge_p1p1* r, const ge_p3* p, const ge_precomp* q) { __asm__ __volatile__ ( - "stp x29, x30, [sp, #-128]!\n\t" + "stp x29, x30, [sp, #-48]!\n\t" "add x29, sp, #0\n\t" - "str %x[qyminusx], [sp, #120]\n\t" - "str %x[qyplusx], [sp, #112]\n\t" - "str %x[qt2d], [sp, #104]\n\t" - "str %x[qz], [sp, #96]\n\t" - "str %x[rx], [x29, #16]\n\t" - "str %x[ry], [x29, #24]\n\t" - "str %x[rz], [x29, #32]\n\t" - "str %x[rt], [x29, #40]\n\t" - "str %x[px], [x29, #48]\n\t" - "str %x[py], [x29, #56]\n\t" - "str %x[pz], [x29, #64]\n\t" - "str %x[pt], [x29, #72]\n\t" - "ldr x2, [x29, #56]\n\t" - "ldr x3, [x29, #48]\n\t" + "str %w[r], [x29, #16]\n\t" + "str %w[p], [x29, #24]\n\t" + "str %w[q], [x29, #32]\n\t" + "mov x3, x1\n\t" + "add x2, x1, #32\n\t" + "add x1, x0, #32\n\t" /* Add */ - "ldp x12, x13, [x2]\n\t" - "ldp x14, x15, [x2, #16]\n\t" - "ldp x16, x17, [x3]\n\t" - "ldp x19, x20, [x3, #16]\n\t" - "adds x4, x12, x16\n\t" - "adcs x5, x13, x17\n\t" - "adcs x6, x14, x19\n\t" - "adc x7, x15, x20\n\t" - "mov x25, #-19\n\t" - "asr x28, x7, #63\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" + "ldp x8, x9, [x2]\n\t" + "ldp x10, x11, [x2, #16]\n\t" + "ldp x4, x5, [x3]\n\t" + "ldp x6, x7, [x3, #16]\n\t" + "adds x16, x8, x4\n\t" + "adcs x17, x9, x5\n\t" + "adcs x19, x10, x6\n\t" + "adcs x20, x11, x7\n\t" + "cset x28, cs\n\t" + "mov x25, #19\n\t" + "extr x28, x28, x20, #63\n\t" + "mul x25, x28, x25\n\t" /* Sub modulus (if overflow) */ - "subs x4, x4, x25\n\t" - "sbcs x5, x5, x28\n\t" - "sbcs x6, x6, x28\n\t" - "sbc x7, x7, x26\n\t" - /* Sub */ - "subs x8, x12, x16\n\t" - "sbcs x9, x13, x17\n\t" 
- "sbcs x10, x14, x19\n\t" - "sbcs x11, x15, x20\n\t" - "mov x25, #-19\n\t" - "csetm x28, cc\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" - /* Add modulus (if underflow) */ - "adds x8, x8, x25\n\t" - "adcs x9, x9, x28\n\t" - "adcs x10, x10, x28\n\t" - "adc x11, x11, x26\n\t" - "ldr x0, [x29, #32]\n\t" - "ldr x2, [sp, #112]\n\t" - /* Multiply */ - "ldp x21, x22, [x2]\n\t" - "ldp x23, x24, [x2, #16]\n\t" - /* A[0] * B[0] */ - "mul x12, x4, x21\n\t" - "umulh x13, x4, x21\n\t" - /* A[0] * B[1] */ - "mul x25, x4, x22\n\t" - "umulh x14, x4, x22\n\t" - "adds x13, x13, x25\n\t" - "adc x14, x14, xzr\n\t" - /* A[1] * B[0] */ - "mul x25, x5, x21\n\t" - "umulh x26, x5, x21\n\t" - "adds x13, x13, x25\n\t" - "adcs x14, x14, x26\n\t" - "adc x15, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x25, x4, x23\n\t" - "umulh x26, x4, x23\n\t" - "adds x14, x14, x25\n\t" - "adc x15, x15, x26\n\t" - /* A[1] * B[1] */ - "mul x25, x5, x22\n\t" - "umulh x26, x5, x22\n\t" - "adds x14, x14, x25\n\t" - "adcs x15, x15, x26\n\t" - "adc x16, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x25, x6, x21\n\t" - "umulh x26, x6, x21\n\t" - "adds x14, x14, x25\n\t" - "adcs x15, x15, x26\n\t" - "adc x16, x16, xzr\n\t" - /* A[0] * B[3] */ - "mul x25, x4, x24\n\t" - "umulh x26, x4, x24\n\t" - "adds x15, x15, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x25, x5, x23\n\t" - "umulh x26, x5, x23\n\t" - "adds x15, x15, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, x17, xzr\n\t" - /* A[2] * B[1] */ - "mul x25, x6, x22\n\t" - "umulh x26, x6, x22\n\t" - "adds x15, x15, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, x17, xzr\n\t" - /* A[3] * B[0] */ - "mul x25, x7, x21\n\t" - "umulh x26, x7, x21\n\t" - "adds x15, x15, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, x17, xzr\n\t" - /* A[1] * B[3] */ - "mul x25, x5, x24\n\t" - "umulh x26, x5, x24\n\t" "adds x16, x16, x25\n\t" - "adcs x17, x17, x26\n\t" - "adc x19, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x25, x6, x23\n\t" - "umulh x26, x6, x23\n\t" - "adds x16, x16, x25\n\t" - "adcs x17, x17, x26\n\t" - "adc x19, x19, xzr\n\t" - /* A[3] * B[1] */ - "mul x25, x7, x22\n\t" - "umulh x26, x7, x22\n\t" - "adds x16, x16, x25\n\t" - "adcs x17, x17, x26\n\t" - "adc x19, x19, xzr\n\t" - /* A[2] * B[3] */ - "mul x25, x6, x24\n\t" - "umulh x26, x6, x24\n\t" - "adds x17, x17, x25\n\t" - "adcs x19, x19, x26\n\t" - "adc x20, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x25, x7, x23\n\t" - "umulh x26, x7, x23\n\t" - "adds x17, x17, x25\n\t" - "adcs x19, x19, x26\n\t" + "adcs x17, x17, xzr\n\t" + "and x20, x20, #0x7fffffffffffffff\n\t" + "adcs x19, x19, xzr\n\t" "adc x20, x20, xzr\n\t" - /* A[3] * B[3] */ - "mul x25, x7, x24\n\t" - "umulh x26, x7, x24\n\t" - "adds x19, x19, x25\n\t" - "adc x20, x20, x26\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x20, x20, x19, #63\n\t" - "extr x19, x19, x17, #63\n\t" - "extr x17, x17, x16, #63\n\t" - "extr x16, x16, x15, #63\n\t" + /* Sub */ + "subs x12, x8, x4\n\t" + "sbcs x13, x9, x5\n\t" + "sbcs x14, x10, x6\n\t" + "sbcs x15, x11, x7\n\t" + "csetm x28, cc\n\t" + "mov x25, #-19\n\t" + "extr x28, x28, x15, #63\n\t" + "mul x25, x28, x25\n\t" + /* Add modulus (if underflow) */ + "subs x12, x12, x25\n\t" + "sbcs x13, x13, xzr\n\t" "and x15, x15, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x25, #19\n\t" - "mul x26, x25, x16\n\t" - "umulh x16, x25, x16\n\t" - "adds x12, x12, x26\n\t" - "mul x26, x25, x17\n\t" - "umulh x17, x25, 
x17\n\t" - "adcs x13, x13, x26\n\t" - "mul x26, x25, x19\n\t" - "umulh x19, x25, x19\n\t" - "adcs x14, x14, x26\n\t" - "mul x26, x25, x20\n\t" - "umulh x27, x25, x20\n\t" - "adcs x15, x15, x26\n\t" - "adc x27, x27, xzr\n\t" - /* Add remaining product results in */ - "adds x13, x13, x16\n\t" - "adcs x14, x14, x17\n\t" - "adcs x15, x15, x19\n\t" + "sbcs x14, x14, xzr\n\t" + "sbc x15, x15, xzr\n\t" + "ldr x2, [x29, #32]\n\t" + "add x2, x2, #32\n\t" + "mov x1, x0\n\t" + /* Multiply */ + "ldp x8, x9, [x2]\n\t" + "ldp x10, x11, [x2, #16]\n\t" + /* A[0] * B[0] */ + "umulh x22, x16, x8\n\t" + "mul x21, x16, x8\n\t" + /* A[2] * B[0] */ + "umulh x24, x19, x8\n\t" + "mul x23, x19, x8\n\t" + /* A[1] * B[0] */ + "mul x25, x17, x8\n\t" + "adds x22, x22, x25\n\t" + "umulh x26, x17, x8\n\t" + "adcs x23, x23, x26\n\t" + "adc x24, x24, xzr\n\t" + /* A[1] * B[3] */ + "umulh x5, x17, x11\n\t" + "mul x4, x17, x11\n\t" + /* A[0] * B[1] */ + "mul x25, x16, x9\n\t" + "adds x22, x22, x25\n\t" + "umulh x26, x16, x9\n\t" + "adcs x23, x23, x26\n\t" + /* A[2] * B[1] */ + "mul x25, x19, x9\n\t" + "adcs x24, x24, x25\n\t" + "umulh x26, x19, x9\n\t" + "adcs x4, x4, x26\n\t" + "adc x5, x5, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x17, x10\n\t" + "adds x24, x24, x25\n\t" + "umulh x26, x17, x10\n\t" + "adcs x4, x4, x26\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x16, x10\n\t" + "adds x23, x23, x25\n\t" + "umulh x26, x16, x10\n\t" + "adcs x24, x24, x26\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[1] */ + "mul x25, x17, x9\n\t" + "adds x23, x23, x25\n\t" + "umulh x26, x17, x9\n\t" + "adcs x24, x24, x26\n\t" + /* A[3] * B[1] */ + "mul x25, x20, x9\n\t" + "adcs x4, x4, x25\n\t" + "umulh x26, x20, x9\n\t" + "adcs x5, x5, x26\n\t" + "adc x6, x6, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x19, x10\n\t" + "adds x4, x4, x25\n\t" + "umulh x26, x19, x10\n\t" + "adcs x5, x5, x26\n\t" + /* A[3] * B[3] */ + "mul x25, x20, x11\n\t" + "adcs x6, x6, x25\n\t" + "umulh x7, x20, x11\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x16, x11\n\t" + "adds x24, x24, x25\n\t" + "umulh x26, x16, x11\n\t" + "adcs x4, x4, x26\n\t" + /* A[2] * B[3] */ + "mul x25, x19, x11\n\t" + "adcs x5, x5, x25\n\t" + "umulh x26, x19, x11\n\t" + "adcs x6, x6, x26\n\t" + "adc x7, x7, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x20, x8\n\t" + "adds x24, x24, x25\n\t" + "umulh x26, x20, x8\n\t" + "adcs x4, x4, x26\n\t" + /* A[3] * B[2] */ + "mul x25, x20, x10\n\t" + "adcs x5, x5, x25\n\t" + "umulh x26, x20, x10\n\t" + "adcs x6, x6, x26\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce */ + "mov x25, #38\n\t" + "mul x26, x25, x7\n\t" + "adds x24, x24, x26\n\t" + "umulh x27, x25, x7\n\t" "adc x27, x27, xzr\n\t" - /* Overflow */ - "extr x27, x27, x15, #63\n\t" + "mov x25, #19\n\t" + "extr x27, x27, x24, #63\n\t" "mul x27, x27, x25\n\t" - "and x15, x15, #0x7fffffffffffffff\n\t" - "adds x12, x12, x27\n\t" - "adcs x13, x13, xzr\n\t" - "adcs x14, x14, xzr\n\t" - "adc x15, x15, xzr\n\t" - /* Reduce if top bit set */ - "and x27, x25, x15, asr 63\n\t" - "and x15, x15, #0x7fffffffffffffff\n\t" - "adds x12, x12, x27\n\t" - "adcs x13, x13, xzr\n\t" - "adcs x14, x14, xzr\n\t" - "adc x15, x15, xzr\n\t" - /* Store */ - "ldr x0, [x29, #24]\n\t" - "ldr x1, [sp, #120]\n\t" + "and x24, x24, #0x7fffffffffffffff\n\t" + "mov x25, #38\n\t" + "mul x26, x25, x4\n\t" + "adds x21, x21, x26\n\t" + "umulh x4, x25, x4\n\t" + "mul x26, x25, x5\n\t" + "adcs x22, x22, x26\n\t" + "umulh x5, x25, x5\n\t" + "mul x26, 
x25, x6\n\t" + "adcs x23, x23, x26\n\t" + "umulh x6, x25, x6\n\t" + "adc x24, x24, xzr\n\t" + /* Add high product results in */ + "adds x21, x21, x27\n\t" + "adcs x22, x22, x4\n\t" + "adcs x23, x23, x5\n\t" + "adc x24, x24, x6\n\t" + "sub x2, x2, #32\n\t" + "add x1, x0, #32\n\t" + "add x0, x0, #32\n\t" /* Multiply */ - "ldp x21, x22, [x1]\n\t" - "ldp x23, x24, [x1, #16]\n\t" - /* A[0] * B[0] */ - "mul x4, x8, x21\n\t" - "umulh x5, x8, x21\n\t" - /* A[0] * B[1] */ - "mul x25, x8, x22\n\t" - "umulh x6, x8, x22\n\t" + "ldp x16, x17, [x2]\n\t" + "ldp x19, x20, [x2, #16]\n\t" + /* A[0] * B[0] */ + "umulh x5, x12, x16\n\t" + "mul x4, x12, x16\n\t" + /* A[2] * B[0] */ + "umulh x7, x14, x16\n\t" + "mul x6, x14, x16\n\t" + /* A[1] * B[0] */ + "mul x25, x13, x16\n\t" "adds x5, x5, x25\n\t" - "adc x6, x6, xzr\n\t" - /* A[1] * B[0] */ - "mul x25, x9, x21\n\t" - "umulh x26, x9, x21\n\t" + "umulh x26, x13, x16\n\t" + "adcs x6, x6, x26\n\t" + "adc x7, x7, xzr\n\t" + /* A[1] * B[3] */ + "umulh x9, x13, x20\n\t" + "mul x8, x13, x20\n\t" + /* A[0] * B[1] */ + "mul x25, x12, x17\n\t" "adds x5, x5, x25\n\t" + "umulh x26, x12, x17\n\t" "adcs x6, x6, x26\n\t" - "adc x7, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x25, x8, x23\n\t" - "umulh x26, x8, x23\n\t" - "adds x6, x6, x25\n\t" - "adc x7, x7, x26\n\t" - /* A[1] * B[1] */ - "mul x25, x9, x22\n\t" - "umulh x26, x9, x22\n\t" + /* A[2] * B[1] */ + "mul x25, x14, x17\n\t" + "adcs x7, x7, x25\n\t" + "umulh x26, x14, x17\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x13, x19\n\t" + "adds x7, x7, x25\n\t" + "umulh x26, x13, x19\n\t" + "adcs x8, x8, x26\n\t" + "adcs x9, x9, xzr\n\t" + "adc x10, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x12, x19\n\t" "adds x6, x6, x25\n\t" + "umulh x26, x12, x19\n\t" "adcs x7, x7, x26\n\t" - "adc x16, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x25, x10, x21\n\t" - "umulh x26, x10, x21\n\t" + "adcs x8, x8, xzr\n\t" + "adcs x9, x9, xzr\n\t" + "adc x10, x10, xzr\n\t" + /* A[1] * B[1] */ + "mul x25, x13, x17\n\t" "adds x6, x6, x25\n\t" + "umulh x26, x13, x17\n\t" "adcs x7, x7, x26\n\t" - "adc x16, x16, xzr\n\t" - /* A[0] * B[3] */ - "mul x25, x8, x24\n\t" - "umulh x26, x8, x24\n\t" - "adds x7, x7, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x25, x9, x23\n\t" - "umulh x26, x9, x23\n\t" - "adds x7, x7, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, x17, xzr\n\t" - /* A[2] * B[1] */ - "mul x25, x10, x22\n\t" - "umulh x26, x10, x22\n\t" + /* A[3] * B[1] */ + "mul x25, x15, x17\n\t" + "adcs x8, x8, x25\n\t" + "umulh x26, x15, x17\n\t" + "adcs x9, x9, x26\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x14, x19\n\t" + "adds x8, x8, x25\n\t" + "umulh x26, x14, x19\n\t" + "adcs x9, x9, x26\n\t" + /* A[3] * B[3] */ + "mul x25, x15, x20\n\t" + "adcs x10, x10, x25\n\t" + "umulh x11, x15, x20\n\t" + "adc x11, x11, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x12, x20\n\t" "adds x7, x7, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, x17, xzr\n\t" - /* A[3] * B[0] */ - "mul x25, x11, x21\n\t" - "umulh x26, x11, x21\n\t" + "umulh x26, x12, x20\n\t" + "adcs x8, x8, x26\n\t" + /* A[2] * B[3] */ + "mul x25, x14, x20\n\t" + "adcs x9, x9, x25\n\t" + "umulh x26, x14, x20\n\t" + "adcs x10, x10, x26\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x15, x16\n\t" "adds x7, x7, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, x17, xzr\n\t" - /* A[1] * B[3] */ - "mul x25, x9, x24\n\t" - "umulh x26, x9, x24\n\t" - "adds x16, x16, x25\n\t" - "adcs x17, x17, 
x26\n\t" - "adc x19, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x25, x10, x23\n\t" - "umulh x26, x10, x23\n\t" - "adds x16, x16, x25\n\t" - "adcs x17, x17, x26\n\t" - "adc x19, x19, xzr\n\t" - /* A[3] * B[1] */ - "mul x25, x11, x22\n\t" - "umulh x26, x11, x22\n\t" - "adds x16, x16, x25\n\t" - "adcs x17, x17, x26\n\t" - "adc x19, x19, xzr\n\t" - /* A[2] * B[3] */ - "mul x25, x10, x24\n\t" - "umulh x26, x10, x24\n\t" - "adds x17, x17, x25\n\t" - "adcs x19, x19, x26\n\t" - "adc x20, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x25, x11, x23\n\t" - "umulh x26, x11, x23\n\t" - "adds x17, x17, x25\n\t" - "adcs x19, x19, x26\n\t" - "adc x20, x20, xzr\n\t" - /* A[3] * B[3] */ - "mul x25, x11, x24\n\t" - "umulh x26, x11, x24\n\t" - "adds x19, x19, x25\n\t" - "adc x20, x20, x26\n\t" + "umulh x26, x15, x16\n\t" + "adcs x8, x8, x26\n\t" + /* A[3] * B[2] */ + "mul x25, x15, x19\n\t" + "adcs x9, x9, x25\n\t" + "umulh x26, x15, x19\n\t" + "adcs x10, x10, x26\n\t" + "adc x11, x11, xzr\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x20, x20, x19, #63\n\t" - "extr x19, x19, x17, #63\n\t" - "extr x17, x17, x16, #63\n\t" - "extr x16, x16, x7, #63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x25, #19\n\t" - "mul x26, x25, x16\n\t" - "umulh x16, x25, x16\n\t" - "adds x4, x4, x26\n\t" - "mul x26, x25, x17\n\t" - "umulh x17, x25, x17\n\t" - "adcs x5, x5, x26\n\t" - "mul x26, x25, x19\n\t" - "umulh x19, x25, x19\n\t" - "adcs x6, x6, x26\n\t" - "mul x26, x25, x20\n\t" - "umulh x27, x25, x20\n\t" - "adcs x7, x7, x26\n\t" - "adc x27, x27, xzr\n\t" - /* Add remaining product results in */ - "adds x5, x5, x16\n\t" - "adcs x6, x6, x17\n\t" - "adcs x7, x7, x19\n\t" + "mov x25, #38\n\t" + "mul x26, x25, x11\n\t" + "adds x7, x7, x26\n\t" + "umulh x27, x25, x11\n\t" "adc x27, x27, xzr\n\t" - /* Overflow */ + "mov x25, #19\n\t" "extr x27, x27, x7, #63\n\t" "mul x27, x27, x25\n\t" "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x27\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" + "mov x25, #38\n\t" + "mul x26, x25, x8\n\t" + "adds x4, x4, x26\n\t" + "umulh x8, x25, x8\n\t" + "mul x26, x25, x9\n\t" + "adcs x5, x5, x26\n\t" + "umulh x9, x25, x9\n\t" + "mul x26, x25, x10\n\t" + "adcs x6, x6, x26\n\t" + "umulh x10, x25, x10\n\t" "adc x7, x7, xzr\n\t" - /* Reduce if top bit set */ - "and x27, x25, x7, asr 63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" + /* Add high product results in */ "adds x4, x4, x27\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" - /* Store */ - "ldr x0, [x29, #24]\n\t" - "ldr x1, [x29, #16]\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, x10\n\t" + "mov x3, x0\n\t" + "sub x2, x0, #32\n\t" + "sub x1, x0, #32\n\t" /* Add */ - "adds x8, x12, x4\n\t" - "adcs x9, x13, x5\n\t" - "adcs x10, x14, x6\n\t" - "adc x11, x15, x7\n\t" + "adds x8, x21, x4\n\t" + "adcs x9, x22, x5\n\t" + "adcs x10, x23, x6\n\t" + "adcs x11, x24, x7\n\t" + "cset x28, cs\n\t" + "mov x25, #19\n\t" + "extr x28, x28, x11, #63\n\t" + "mul x25, x28, x25\n\t" + /* Sub modulus (if overflow) */ + "adds x8, x8, x25\n\t" + "adcs x9, x9, xzr\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, x11, xzr\n\t" + /* Sub */ + "subs x12, x21, x4\n\t" + "sbcs x13, x22, x5\n\t" + "sbcs x14, x23, x6\n\t" + "sbcs x15, x24, x7\n\t" + "csetm x28, cc\n\t" + "mov x25, #-19\n\t" + "extr x28, x28, x15, #63\n\t" + "mul x25, x28, x25\n\t" + /* Add modulus (if underflow) */ + "subs x12, x12, x25\n\t" + 
"sbcs x13, x13, xzr\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "sbcs x14, x14, xzr\n\t" + "sbc x15, x15, xzr\n\t" + "stp x8, x9, [x0]\n\t" + "stp x10, x11, [x0, #16]\n\t" + "stp x12, x13, [x1]\n\t" + "stp x14, x15, [x1, #16]\n\t" + "ldr x1, [x29, #24]\n\t" + "ldr x2, [x29, #32]\n\t" + "add x2, x2, #0x40\n\t" + "add x1, x1, #0x60\n\t" + "add x0, x0, #0x40\n\t" + /* Multiply */ + "ldp x21, x22, [x1]\n\t" + "ldp x23, x24, [x1, #16]\n\t" + "ldp x4, x5, [x2]\n\t" + "ldp x6, x7, [x2, #16]\n\t" + /* A[0] * B[0] */ + "umulh x17, x21, x4\n\t" + "mul x16, x21, x4\n\t" + /* A[2] * B[0] */ + "umulh x20, x23, x4\n\t" + "mul x19, x23, x4\n\t" + /* A[1] * B[0] */ + "mul x25, x22, x4\n\t" + "adds x17, x17, x25\n\t" + "umulh x26, x22, x4\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, x20, xzr\n\t" + /* A[1] * B[3] */ + "umulh x9, x22, x7\n\t" + "mul x8, x22, x7\n\t" + /* A[0] * B[1] */ + "mul x25, x21, x5\n\t" + "adds x17, x17, x25\n\t" + "umulh x26, x21, x5\n\t" + "adcs x19, x19, x26\n\t" + /* A[2] * B[1] */ + "mul x25, x23, x5\n\t" + "adcs x20, x20, x25\n\t" + "umulh x26, x23, x5\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x22, x6\n\t" + "adds x20, x20, x25\n\t" + "umulh x26, x22, x6\n\t" + "adcs x8, x8, x26\n\t" + "adcs x9, x9, xzr\n\t" + "adc x10, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x21, x6\n\t" + "adds x19, x19, x25\n\t" + "umulh x26, x21, x6\n\t" + "adcs x20, x20, x26\n\t" + "adcs x8, x8, xzr\n\t" + "adcs x9, x9, xzr\n\t" + "adc x10, x10, xzr\n\t" + /* A[1] * B[1] */ + "mul x25, x22, x5\n\t" + "adds x19, x19, x25\n\t" + "umulh x26, x22, x5\n\t" + "adcs x20, x20, x26\n\t" + /* A[3] * B[1] */ + "mul x25, x24, x5\n\t" + "adcs x8, x8, x25\n\t" + "umulh x26, x24, x5\n\t" + "adcs x9, x9, x26\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x23, x6\n\t" + "adds x8, x8, x25\n\t" + "umulh x26, x23, x6\n\t" + "adcs x9, x9, x26\n\t" + /* A[3] * B[3] */ + "mul x25, x24, x7\n\t" + "adcs x10, x10, x25\n\t" + "umulh x11, x24, x7\n\t" + "adc x11, x11, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x21, x7\n\t" + "adds x20, x20, x25\n\t" + "umulh x26, x21, x7\n\t" + "adcs x8, x8, x26\n\t" + /* A[2] * B[3] */ + "mul x25, x23, x7\n\t" + "adcs x9, x9, x25\n\t" + "umulh x26, x23, x7\n\t" + "adcs x10, x10, x26\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x24, x4\n\t" + "adds x20, x20, x25\n\t" + "umulh x26, x24, x4\n\t" + "adcs x8, x8, x26\n\t" + /* A[3] * B[2] */ + "mul x25, x24, x6\n\t" + "adcs x9, x9, x25\n\t" + "umulh x26, x24, x6\n\t" + "adcs x10, x10, x26\n\t" + "adc x11, x11, xzr\n\t" + /* Reduce */ + "mov x25, #38\n\t" + "mul x26, x25, x11\n\t" + "adds x20, x20, x26\n\t" + "umulh x27, x25, x11\n\t" + "adc x27, x27, xzr\n\t" + "mov x25, #19\n\t" + "extr x27, x27, x20, #63\n\t" + "mul x27, x27, x25\n\t" + "and x20, x20, #0x7fffffffffffffff\n\t" + "mov x25, #38\n\t" + "mul x26, x25, x8\n\t" + "adds x16, x16, x26\n\t" + "umulh x8, x25, x8\n\t" + "mul x26, x25, x9\n\t" + "adcs x17, x17, x26\n\t" + "umulh x9, x25, x9\n\t" + "mul x26, x25, x10\n\t" + "adcs x19, x19, x26\n\t" + "umulh x10, x25, x10\n\t" + "adc x20, x20, xzr\n\t" + /* Add high product results in */ + "adds x16, x16, x27\n\t" + "adcs x17, x17, x8\n\t" + "adcs x19, x19, x9\n\t" + "adc x20, x20, x10\n\t" + "sub x1, x1, #32\n\t" + /* Double */ + "ldp x12, x13, [x1]\n\t" + "ldp x14, x15, [x1, #16]\n\t" + "adds x12, x12, x12\n\t" + "adcs x13, x13, x13\n\t" + "adcs x14, x14, x14\n\t" + "adc x15, x15, x15\n\t" "mov x25, #-19\n\t" - "asr x28, x11, #63\n\t" + "asr x28, x15, #63\n\t" /* 
Mask the modulus */ "and x25, x28, x25\n\t" "and x26, x28, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x8, x8, x25\n\t" - "sbcs x9, x9, x28\n\t" - "sbcs x10, x10, x28\n\t" - "sbc x11, x11, x26\n\t" + "subs x12, x12, x25\n\t" + "sbcs x13, x13, x28\n\t" + "sbcs x14, x14, x28\n\t" + "sbc x15, x15, x26\n\t" + "mov x3, x0\n\t" + "sub x2, x0, #32\n\t" + "sub x1, x0, #32\n\t" + /* Add */ + "adds x8, x12, x16\n\t" + "adcs x9, x13, x17\n\t" + "adcs x10, x14, x19\n\t" + "adcs x11, x15, x20\n\t" + "cset x28, cs\n\t" + "mov x25, #19\n\t" + "extr x28, x28, x11, #63\n\t" + "mul x25, x28, x25\n\t" + /* Sub modulus (if overflow) */ + "adds x8, x8, x25\n\t" + "adcs x9, x9, xzr\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, x11, xzr\n\t" /* Sub */ - "subs x16, x12, x4\n\t" - "sbcs x17, x13, x5\n\t" - "sbcs x19, x14, x6\n\t" - "sbcs x20, x15, x7\n\t" - "mov x25, #-19\n\t" + "subs x4, x12, x16\n\t" + "sbcs x5, x13, x17\n\t" + "sbcs x6, x14, x19\n\t" + "sbcs x7, x15, x20\n\t" "csetm x28, cc\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" + "mov x25, #-19\n\t" + "extr x28, x28, x7, #63\n\t" + "mul x25, x28, x25\n\t" /* Add modulus (if underflow) */ - "adds x16, x16, x25\n\t" - "adcs x17, x17, x28\n\t" - "adcs x19, x19, x28\n\t" - "adc x20, x20, x26\n\t" + "subs x4, x4, x25\n\t" + "sbcs x5, x5, xzr\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "sbcs x6, x6, xzr\n\t" + "sbc x7, x7, xzr\n\t" "stp x8, x9, [x0]\n\t" "stp x10, x11, [x0, #16]\n\t" - "stp x16, x17, [x1]\n\t" - "stp x19, x20, [x1, #16]\n\t" - "ldr x0, [x29, #48]\n\t" - "ldr x1, [x29, #64]\n\t" - "ldr x2, [sp, #96]\n\t" + "stp x4, x5, [x1]\n\t" + "stp x6, x7, [x1, #16]\n\t" + "ldp x29, x30, [sp], #48\n\t" + : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q) + : + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +void ge_add(ge_p1p1* r, const ge_p3* p, const ge_cached* q) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-48]!\n\t" + "add x29, sp, #0\n\t" + "str %w[r], [x29, #16]\n\t" + "str %w[p], [x29, #24]\n\t" + "str %w[q], [x29, #32]\n\t" + "mov x3, x1\n\t" + "add x2, x1, #32\n\t" + "add x1, x0, #32\n\t" + /* Add */ + "ldp x8, x9, [x2]\n\t" + "ldp x10, x11, [x2, #16]\n\t" + "ldp x4, x5, [x3]\n\t" + "ldp x6, x7, [x3, #16]\n\t" + "adds x16, x8, x4\n\t" + "adcs x17, x9, x5\n\t" + "adcs x19, x10, x6\n\t" + "adcs x20, x11, x7\n\t" + "cset x28, cs\n\t" + "mov x25, #19\n\t" + "extr x28, x28, x20, #63\n\t" + "mul x25, x28, x25\n\t" + /* Sub modulus (if overflow) */ + "adds x16, x16, x25\n\t" + "adcs x17, x17, xzr\n\t" + "and x20, x20, #0x7fffffffffffffff\n\t" + "adcs x19, x19, xzr\n\t" + "adc x20, x20, xzr\n\t" + /* Sub */ + "subs x12, x8, x4\n\t" + "sbcs x13, x9, x5\n\t" + "sbcs x14, x10, x6\n\t" + "sbcs x15, x11, x7\n\t" + "csetm x28, cc\n\t" + "mov x25, #-19\n\t" + "extr x28, x28, x15, #63\n\t" + "mul x25, x28, x25\n\t" + /* Add modulus (if underflow) */ + "subs x12, x12, x25\n\t" + "sbcs x13, x13, xzr\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "sbcs x14, x14, xzr\n\t" + "sbc x15, x15, xzr\n\t" + "ldr x2, [x29, #32]\n\t" + "mov x1, x0\n\t" + /* Multiply */ + "ldp x8, x9, [x2]\n\t" + "ldp x10, x11, [x2, #16]\n\t" + /* A[0] * B[0] */ + "umulh x22, x16, x8\n\t" + "mul x21, x16, x8\n\t" + /* A[2] * B[0] */ + "umulh x24, x19, x8\n\t" + "mul x23, x19, x8\n\t" + /* A[1] * B[0] */ + "mul x25, x17, x8\n\t" + "adds x22, 
x22, x25\n\t" + "umulh x26, x17, x8\n\t" + "adcs x23, x23, x26\n\t" + "adc x24, x24, xzr\n\t" + /* A[1] * B[3] */ + "umulh x5, x17, x11\n\t" + "mul x4, x17, x11\n\t" + /* A[0] * B[1] */ + "mul x25, x16, x9\n\t" + "adds x22, x22, x25\n\t" + "umulh x26, x16, x9\n\t" + "adcs x23, x23, x26\n\t" + /* A[2] * B[1] */ + "mul x25, x19, x9\n\t" + "adcs x24, x24, x25\n\t" + "umulh x26, x19, x9\n\t" + "adcs x4, x4, x26\n\t" + "adc x5, x5, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x17, x10\n\t" + "adds x24, x24, x25\n\t" + "umulh x26, x17, x10\n\t" + "adcs x4, x4, x26\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x16, x10\n\t" + "adds x23, x23, x25\n\t" + "umulh x26, x16, x10\n\t" + "adcs x24, x24, x26\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[1] */ + "mul x25, x17, x9\n\t" + "adds x23, x23, x25\n\t" + "umulh x26, x17, x9\n\t" + "adcs x24, x24, x26\n\t" + /* A[3] * B[1] */ + "mul x25, x20, x9\n\t" + "adcs x4, x4, x25\n\t" + "umulh x26, x20, x9\n\t" + "adcs x5, x5, x26\n\t" + "adc x6, x6, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x19, x10\n\t" + "adds x4, x4, x25\n\t" + "umulh x26, x19, x10\n\t" + "adcs x5, x5, x26\n\t" + /* A[3] * B[3] */ + "mul x25, x20, x11\n\t" + "adcs x6, x6, x25\n\t" + "umulh x7, x20, x11\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x16, x11\n\t" + "adds x24, x24, x25\n\t" + "umulh x26, x16, x11\n\t" + "adcs x4, x4, x26\n\t" + /* A[2] * B[3] */ + "mul x25, x19, x11\n\t" + "adcs x5, x5, x25\n\t" + "umulh x26, x19, x11\n\t" + "adcs x6, x6, x26\n\t" + "adc x7, x7, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x20, x8\n\t" + "adds x24, x24, x25\n\t" + "umulh x26, x20, x8\n\t" + "adcs x4, x4, x26\n\t" + /* A[3] * B[2] */ + "mul x25, x20, x10\n\t" + "adcs x5, x5, x25\n\t" + "umulh x26, x20, x10\n\t" + "adcs x6, x6, x26\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce */ + "mov x25, #38\n\t" + "mul x26, x25, x7\n\t" + "adds x24, x24, x26\n\t" + "umulh x27, x25, x7\n\t" + "adc x27, x27, xzr\n\t" + "mov x25, #19\n\t" + "extr x27, x27, x24, #63\n\t" + "mul x27, x27, x25\n\t" + "and x24, x24, #0x7fffffffffffffff\n\t" + "mov x25, #38\n\t" + "mul x26, x25, x4\n\t" + "adds x21, x21, x26\n\t" + "umulh x4, x25, x4\n\t" + "mul x26, x25, x5\n\t" + "adcs x22, x22, x26\n\t" + "umulh x5, x25, x5\n\t" + "mul x26, x25, x6\n\t" + "adcs x23, x23, x26\n\t" + "umulh x6, x25, x6\n\t" + "adc x24, x24, xzr\n\t" + /* Add high product results in */ + "adds x21, x21, x27\n\t" + "adcs x22, x22, x4\n\t" + "adcs x23, x23, x5\n\t" + "adc x24, x24, x6\n\t" + /* Store */ + "stp x21, x22, [x0]\n\t" + "stp x23, x24, [x0, #16]\n\t" + "add x2, x2, #32\n\t" + "add x1, x0, #32\n\t" + "add x0, x0, #32\n\t" /* Multiply */ - "ldp x12, x13, [x1]\n\t" - "ldp x14, x15, [x1, #16]\n\t" "ldp x16, x17, [x2]\n\t" "ldp x19, x20, [x2, #16]\n\t" - /* A[0] * B[0] */ - "mul x4, x12, x16\n\t" + /* A[0] * B[0] */ "umulh x5, x12, x16\n\t" - /* A[0] * B[1] */ - "mul x25, x12, x17\n\t" - "umulh x6, x12, x17\n\t" - "adds x5, x5, x25\n\t" - "adc x6, x6, xzr\n\t" - /* A[1] * B[0] */ + "mul x4, x12, x16\n\t" + /* A[2] * B[0] */ + "umulh x7, x14, x16\n\t" + "mul x6, x14, x16\n\t" + /* A[1] * B[0] */ "mul x25, x13, x16\n\t" + "adds x5, x5, x25\n\t" "umulh x26, x13, x16\n\t" + "adcs x6, x6, x26\n\t" + "adc x7, x7, xzr\n\t" + /* A[1] * B[3] */ + "umulh x9, x13, x20\n\t" + "mul x8, x13, x20\n\t" + /* A[0] * B[1] */ + "mul x25, x12, x17\n\t" "adds x5, x5, x25\n\t" + "umulh x26, x12, x17\n\t" "adcs x6, x6, x26\n\t" - "adc x7, xzr, xzr\n\t" - /* A[0] * B[2] */ - 
"mul x25, x12, x19\n\t" - "umulh x26, x12, x19\n\t" - "adds x6, x6, x25\n\t" - "adc x7, x7, x26\n\t" - /* A[1] * B[1] */ - "mul x25, x13, x17\n\t" - "umulh x26, x13, x17\n\t" - "adds x6, x6, x25\n\t" - "adcs x7, x7, x26\n\t" - "adc x8, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x25, x14, x16\n\t" - "umulh x26, x14, x16\n\t" - "adds x6, x6, x25\n\t" - "adcs x7, x7, x26\n\t" - "adc x8, x8, xzr\n\t" - /* A[0] * B[3] */ - "mul x25, x12, x20\n\t" - "umulh x26, x12, x20\n\t" - "adds x7, x7, x25\n\t" - "adcs x8, x8, x26\n\t" - "adc x9, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x25, x13, x19\n\t" - "umulh x26, x13, x19\n\t" - "adds x7, x7, x25\n\t" - "adcs x8, x8, x26\n\t" - "adc x9, x9, xzr\n\t" - /* A[2] * B[1] */ + /* A[2] * B[1] */ "mul x25, x14, x17\n\t" + "adcs x7, x7, x25\n\t" "umulh x26, x14, x17\n\t" - "adds x7, x7, x25\n\t" "adcs x8, x8, x26\n\t" "adc x9, x9, xzr\n\t" - /* A[3] * B[0] */ - "mul x25, x15, x16\n\t" - "umulh x26, x15, x16\n\t" + /* A[1] * B[2] */ + "mul x25, x13, x19\n\t" "adds x7, x7, x25\n\t" + "umulh x26, x13, x19\n\t" "adcs x8, x8, x26\n\t" - "adc x9, x9, xzr\n\t" - /* A[1] * B[3] */ - "mul x25, x13, x20\n\t" - "umulh x26, x13, x20\n\t" - "adds x8, x8, x25\n\t" - "adcs x9, x9, x26\n\t" + "adcs x9, x9, xzr\n\t" "adc x10, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x25, x14, x19\n\t" - "umulh x26, x14, x19\n\t" - "adds x8, x8, x25\n\t" - "adcs x9, x9, x26\n\t" + /* A[0] * B[2] */ + "mul x25, x12, x19\n\t" + "adds x6, x6, x25\n\t" + "umulh x26, x12, x19\n\t" + "adcs x7, x7, x26\n\t" + "adcs x8, x8, xzr\n\t" + "adcs x9, x9, xzr\n\t" "adc x10, x10, xzr\n\t" - /* A[3] * B[1] */ + /* A[1] * B[1] */ + "mul x25, x13, x17\n\t" + "adds x6, x6, x25\n\t" + "umulh x26, x13, x17\n\t" + "adcs x7, x7, x26\n\t" + /* A[3] * B[1] */ "mul x25, x15, x17\n\t" + "adcs x8, x8, x25\n\t" "umulh x26, x15, x17\n\t" - "adds x8, x8, x25\n\t" "adcs x9, x9, x26\n\t" "adc x10, x10, xzr\n\t" - /* A[2] * B[3] */ + /* A[2] * B[2] */ + "mul x25, x14, x19\n\t" + "adds x8, x8, x25\n\t" + "umulh x26, x14, x19\n\t" + "adcs x9, x9, x26\n\t" + /* A[3] * B[3] */ + "mul x25, x15, x20\n\t" + "adcs x10, x10, x25\n\t" + "umulh x11, x15, x20\n\t" + "adc x11, x11, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x12, x20\n\t" + "adds x7, x7, x25\n\t" + "umulh x26, x12, x20\n\t" + "adcs x8, x8, x26\n\t" + /* A[2] * B[3] */ "mul x25, x14, x20\n\t" + "adcs x9, x9, x25\n\t" "umulh x26, x14, x20\n\t" - "adds x9, x9, x25\n\t" - "adcs x10, x10, x26\n\t" - "adc x11, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x25, x15, x19\n\t" - "umulh x26, x15, x19\n\t" - "adds x9, x9, x25\n\t" "adcs x10, x10, x26\n\t" "adc x11, x11, xzr\n\t" - /* A[3] * B[3] */ - "mul x25, x15, x20\n\t" - "umulh x26, x15, x20\n\t" - "adds x10, x10, x25\n\t" - "adc x11, x11, x26\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x25, #19\n\t" - "mul x26, x25, x8\n\t" - "umulh x8, x25, x8\n\t" - "adds x4, x4, x26\n\t" - "mul x26, x25, x9\n\t" - "umulh x9, x25, x9\n\t" - "adcs x5, x5, x26\n\t" - "mul x26, x25, x10\n\t" - "umulh x10, x25, x10\n\t" - "adcs x6, x6, x26\n\t" - "mul x26, x25, x11\n\t" - "umulh x27, x25, x11\n\t" - "adcs x7, x7, x26\n\t" - "adc x27, x27, xzr\n\t" - /* Add remaining product results in */ - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adcs x7, x7, x10\n\t" - "adc x27, x27, xzr\n\t" - /* Overflow */ - "extr x27, x27, x7, 
#63\n\t" - "mul x27, x27, x25\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x27\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" - /* Reduce if top bit set */ - "and x27, x25, x7, asr 63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x27\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" - /* Store */ - "ldr x0, [x29, #48]\n\t" - /* Double */ - "adds x4, x4, x4\n\t" - "adcs x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adc x7, x7, x7\n\t" - "mov x25, #-19\n\t" - "asr x28, x7, #63\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" - /* Sub modulus (if overflow) */ - "subs x4, x4, x25\n\t" - "sbcs x5, x5, x28\n\t" - "sbcs x6, x6, x28\n\t" - "sbc x7, x7, x26\n\t" - "ldr x0, [x29, #40]\n\t" - "ldr x1, [sp, #104]\n\t" - "ldr x2, [x29, #72]\n\t" - /* Multiply */ - "ldp x16, x17, [x1]\n\t" - "ldp x19, x20, [x1, #16]\n\t" - "ldp x21, x22, [x2]\n\t" - "ldp x23, x24, [x2, #16]\n\t" - /* A[0] * B[0] */ - "mul x8, x16, x21\n\t" - "umulh x9, x16, x21\n\t" - /* A[0] * B[1] */ - "mul x25, x16, x22\n\t" - "umulh x10, x16, x22\n\t" - "adds x9, x9, x25\n\t" - "adc x10, x10, xzr\n\t" - /* A[1] * B[0] */ - "mul x25, x17, x21\n\t" - "umulh x26, x17, x21\n\t" - "adds x9, x9, x25\n\t" - "adcs x10, x10, x26\n\t" - "adc x11, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x25, x16, x23\n\t" - "umulh x26, x16, x23\n\t" - "adds x10, x10, x25\n\t" - "adc x11, x11, x26\n\t" - /* A[1] * B[1] */ - "mul x25, x17, x22\n\t" - "umulh x26, x17, x22\n\t" - "adds x10, x10, x25\n\t" - "adcs x11, x11, x26\n\t" - "adc x12, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x25, x19, x21\n\t" - "umulh x26, x19, x21\n\t" - "adds x10, x10, x25\n\t" - "adcs x11, x11, x26\n\t" - "adc x12, x12, xzr\n\t" - /* A[0] * B[3] */ - "mul x25, x16, x24\n\t" - "umulh x26, x16, x24\n\t" - "adds x11, x11, x25\n\t" - "adcs x12, x12, x26\n\t" - "adc x13, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x25, x17, x23\n\t" - "umulh x26, x17, x23\n\t" - "adds x11, x11, x25\n\t" - "adcs x12, x12, x26\n\t" - "adc x13, x13, xzr\n\t" - /* A[2] * B[1] */ - "mul x25, x19, x22\n\t" - "umulh x26, x19, x22\n\t" - "adds x11, x11, x25\n\t" - "adcs x12, x12, x26\n\t" - "adc x13, x13, xzr\n\t" - /* A[3] * B[0] */ - "mul x25, x20, x21\n\t" - "umulh x26, x20, x21\n\t" - "adds x11, x11, x25\n\t" - "adcs x12, x12, x26\n\t" - "adc x13, x13, xzr\n\t" - /* A[1] * B[3] */ - "mul x25, x17, x24\n\t" - "umulh x26, x17, x24\n\t" - "adds x12, x12, x25\n\t" - "adcs x13, x13, x26\n\t" - "adc x14, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x25, x19, x23\n\t" - "umulh x26, x19, x23\n\t" - "adds x12, x12, x25\n\t" - "adcs x13, x13, x26\n\t" - "adc x14, x14, xzr\n\t" - /* A[3] * B[1] */ - "mul x25, x20, x22\n\t" - "umulh x26, x20, x22\n\t" - "adds x12, x12, x25\n\t" - "adcs x13, x13, x26\n\t" - "adc x14, x14, xzr\n\t" - /* A[2] * B[3] */ - "mul x25, x19, x24\n\t" - "umulh x26, x19, x24\n\t" - "adds x13, x13, x25\n\t" - "adcs x14, x14, x26\n\t" - "adc x15, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x25, x20, x23\n\t" - "umulh x26, x20, x23\n\t" - "adds x13, x13, x25\n\t" - "adcs x14, x14, x26\n\t" - "adc x15, x15, xzr\n\t" - /* A[3] * B[3] */ - "mul x25, x20, x24\n\t" - "umulh x26, x20, x24\n\t" - "adds x14, x14, x25\n\t" - "adc x15, x15, x26\n\t" - /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x15, x15, x14, #63\n\t" - "extr x14, x14, x13, #63\n\t" - "extr x13, x13, x12, #63\n\t" - "extr x12, x12, x11, #63\n\t" - "and x11, x11, 
#0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x25, #19\n\t" - "mul x26, x25, x12\n\t" - "umulh x12, x25, x12\n\t" - "adds x8, x8, x26\n\t" - "mul x26, x25, x13\n\t" - "umulh x13, x25, x13\n\t" - "adcs x9, x9, x26\n\t" - "mul x26, x25, x14\n\t" - "umulh x14, x25, x14\n\t" + /* A[3] * B[0] */ + "mul x25, x15, x16\n\t" + "adds x7, x7, x25\n\t" + "umulh x26, x15, x16\n\t" + "adcs x8, x8, x26\n\t" + /* A[3] * B[2] */ + "mul x25, x15, x19\n\t" + "adcs x9, x9, x25\n\t" + "umulh x26, x15, x19\n\t" "adcs x10, x10, x26\n\t" - "mul x26, x25, x15\n\t" - "umulh x27, x25, x15\n\t" - "adcs x11, x11, x26\n\t" - "adc x27, x27, xzr\n\t" - /* Add remaining product results in */ - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" - "adcs x11, x11, x14\n\t" + "adc x11, x11, xzr\n\t" + /* Reduce */ + "mov x25, #38\n\t" + "mul x26, x25, x11\n\t" + "adds x7, x7, x26\n\t" + "umulh x27, x25, x11\n\t" "adc x27, x27, xzr\n\t" - /* Overflow */ - "extr x27, x27, x11, #63\n\t" + "mov x25, #19\n\t" + "extr x27, x27, x7, #63\n\t" "mul x27, x27, x25\n\t" - "and x11, x11, #0x7fffffffffffffff\n\t" - "adds x8, x8, x27\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "mov x25, #38\n\t" + "mul x26, x25, x8\n\t" + "adds x4, x4, x26\n\t" + "umulh x8, x25, x8\n\t" + "mul x26, x25, x9\n\t" + "adcs x5, x5, x26\n\t" + "umulh x9, x25, x9\n\t" + "mul x26, x25, x10\n\t" + "adcs x6, x6, x26\n\t" + "umulh x10, x25, x10\n\t" + "adc x7, x7, xzr\n\t" + /* Add high product results in */ + "adds x4, x4, x27\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, x10\n\t" + /* Store */ + "stp x4, x5, [x0]\n\t" + "stp x6, x7, [x0, #16]\n\t" + "mov x3, x0\n\t" + "sub x2, x0, #32\n\t" + "sub x1, x0, #32\n\t" + /* Add */ + "adds x8, x21, x4\n\t" + "adcs x9, x22, x5\n\t" + "adcs x10, x23, x6\n\t" + "adcs x11, x24, x7\n\t" + "cset x28, cs\n\t" + "mov x25, #19\n\t" + "extr x28, x28, x11, #63\n\t" + "mul x25, x28, x25\n\t" + /* Sub modulus (if overflow) */ + "adds x8, x8, x25\n\t" "adcs x9, x9, xzr\n\t" - "adcs x10, x10, xzr\n\t" - "adc x11, x11, xzr\n\t" - /* Reduce if top bit set */ - "and x27, x25, x11, asr 63\n\t" "and x11, x11, #0x7fffffffffffffff\n\t" - "adds x8, x8, x27\n\t" - "adcs x9, x9, xzr\n\t" "adcs x10, x10, xzr\n\t" "adc x11, x11, xzr\n\t" - /* Store */ - "ldr x0, [x29, #32]\n\t" - "ldr x1, [x29, #40]\n\t" - /* Add */ - "adds x12, x4, x8\n\t" - "adcs x13, x5, x9\n\t" - "adcs x14, x6, x10\n\t" - "adc x15, x7, x11\n\t" - "mov x25, #-19\n\t" - "asr x28, x15, #63\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" - /* Sub modulus (if overflow) */ - "subs x12, x12, x25\n\t" - "sbcs x13, x13, x28\n\t" - "sbcs x14, x14, x28\n\t" - "sbc x15, x15, x26\n\t" /* Sub */ - "subs x16, x4, x8\n\t" - "sbcs x17, x5, x9\n\t" - "sbcs x19, x6, x10\n\t" - "sbcs x20, x7, x11\n\t" - "mov x25, #-19\n\t" + "subs x12, x21, x4\n\t" + "sbcs x13, x22, x5\n\t" + "sbcs x14, x23, x6\n\t" + "sbcs x15, x24, x7\n\t" "csetm x28, cc\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" - /* Add modulus (if underflow) */ - "adds x16, x16, x25\n\t" - "adcs x17, x17, x28\n\t" - "adcs x19, x19, x28\n\t" - "adc x20, x20, x26\n\t" - "stp x12, x13, [x0]\n\t" - "stp x14, x15, [x0, #16]\n\t" - "stp x16, x17, [x1]\n\t" - "stp x19, x20, [x1, #16]\n\t" - "ldp x29, x30, [sp], #0x80\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt), [qz] "+r" (qz), [qt2d] "+r" (qt2d), [qyplusx] "+r" 
(qyplusx), [qyminusx] "+r" (qyminusx) - : - : "memory", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" - ); -} - -void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx) -{ - __asm__ __volatile__ ( - "stp x29, x30, [sp, #-128]!\n\t" - "add x29, sp, #0\n\t" - "str %x[qyminusx], [sp, #120]\n\t" - "str %x[qyplusx], [sp, #112]\n\t" - "str %x[qt2d], [sp, #104]\n\t" - "str %x[qz], [sp, #96]\n\t" - "str %x[rx], [x29, #16]\n\t" - "str %x[ry], [x29, #24]\n\t" - "str %x[rz], [x29, #32]\n\t" - "str %x[rt], [x29, #40]\n\t" - "str %x[px], [x29, #48]\n\t" - "str %x[py], [x29, #56]\n\t" - "str %x[pz], [x29, #64]\n\t" - "str %x[pt], [x29, #72]\n\t" - "ldr x2, [x29, #56]\n\t" - "ldr x3, [x29, #48]\n\t" - /* Add */ - "ldp x12, x13, [x2]\n\t" - "ldp x14, x15, [x2, #16]\n\t" - "ldp x16, x17, [x3]\n\t" - "ldp x19, x20, [x3, #16]\n\t" - "adds x4, x12, x16\n\t" - "adcs x5, x13, x17\n\t" - "adcs x6, x14, x19\n\t" - "adc x7, x15, x20\n\t" - "mov x25, #-19\n\t" - "asr x28, x7, #63\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" - /* Sub modulus (if overflow) */ - "subs x4, x4, x25\n\t" - "sbcs x5, x5, x28\n\t" - "sbcs x6, x6, x28\n\t" - "sbc x7, x7, x26\n\t" - /* Sub */ - "subs x8, x12, x16\n\t" - "sbcs x9, x13, x17\n\t" - "sbcs x10, x14, x19\n\t" - "sbcs x11, x15, x20\n\t" "mov x25, #-19\n\t" - "csetm x28, cc\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" + "extr x28, x28, x15, #63\n\t" + "mul x25, x28, x25\n\t" /* Add modulus (if underflow) */ - "adds x8, x8, x25\n\t" - "adcs x9, x9, x28\n\t" - "adcs x10, x10, x28\n\t" - "adc x11, x11, x26\n\t" - "ldr x0, [x29, #32]\n\t" - "ldr x2, [sp, #120]\n\t" + "subs x12, x12, x25\n\t" + "sbcs x13, x13, xzr\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "sbcs x14, x14, xzr\n\t" + "sbc x15, x15, xzr\n\t" + "stp x8, x9, [x0]\n\t" + "stp x10, x11, [x0, #16]\n\t" + "stp x12, x13, [x1]\n\t" + "stp x14, x15, [x1, #16]\n\t" + "ldr x1, [x29, #24]\n\t" + "ldr x2, [x29, #32]\n\t" + "add x2, x2, #0x60\n\t" + "add x1, x1, #0x60\n\t" + "add x0, x0, #0x40\n\t" /* Multiply */ - "ldp x21, x22, [x2]\n\t" - "ldp x23, x24, [x2, #16]\n\t" - /* A[0] * B[0] */ - "mul x12, x4, x21\n\t" - "umulh x13, x4, x21\n\t" - /* A[0] * B[1] */ - "mul x25, x4, x22\n\t" - "umulh x14, x4, x22\n\t" - "adds x13, x13, x25\n\t" - "adc x14, x14, xzr\n\t" - /* A[1] * B[0] */ - "mul x25, x5, x21\n\t" - "umulh x26, x5, x21\n\t" - "adds x13, x13, x25\n\t" - "adcs x14, x14, x26\n\t" - "adc x15, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x25, x4, x23\n\t" - "umulh x26, x4, x23\n\t" - "adds x14, x14, x25\n\t" - "adc x15, x15, x26\n\t" - /* A[1] * B[1] */ - "mul x25, x5, x22\n\t" - "umulh x26, x5, x22\n\t" - "adds x14, x14, x25\n\t" - "adcs x15, x15, x26\n\t" - "adc x16, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x25, x6, x21\n\t" - "umulh x26, x6, x21\n\t" - "adds x14, x14, x25\n\t" - "adcs x15, x15, x26\n\t" - "adc x16, x16, xzr\n\t" - /* A[0] * B[3] */ - "mul x25, x4, x24\n\t" - "umulh x26, x4, x24\n\t" - "adds x15, x15, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x25, x5, x23\n\t" - "umulh x26, x5, x23\n\t" - "adds x15, x15, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, x17, xzr\n\t" - /* A[2] * B[1] */ - "mul x25, x6, x22\n\t" - "umulh x26, x6, x22\n\t" - "adds x15, x15, x25\n\t" - "adcs x16, x16, x26\n\t" - 
"adc x17, x17, xzr\n\t" - /* A[3] * B[0] */ - "mul x25, x7, x21\n\t" - "umulh x26, x7, x21\n\t" - "adds x15, x15, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, x17, xzr\n\t" - /* A[1] * B[3] */ - "mul x25, x5, x24\n\t" - "umulh x26, x5, x24\n\t" - "adds x16, x16, x25\n\t" - "adcs x17, x17, x26\n\t" - "adc x19, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x25, x6, x23\n\t" - "umulh x26, x6, x23\n\t" - "adds x16, x16, x25\n\t" - "adcs x17, x17, x26\n\t" - "adc x19, x19, xzr\n\t" - /* A[3] * B[1] */ - "mul x25, x7, x22\n\t" - "umulh x26, x7, x22\n\t" - "adds x16, x16, x25\n\t" - "adcs x17, x17, x26\n\t" - "adc x19, x19, xzr\n\t" - /* A[2] * B[3] */ - "mul x25, x6, x24\n\t" - "umulh x26, x6, x24\n\t" + "ldp x21, x22, [x1]\n\t" + "ldp x23, x24, [x1, #16]\n\t" + "ldp x4, x5, [x2]\n\t" + "ldp x6, x7, [x2, #16]\n\t" + /* A[0] * B[0] */ + "umulh x17, x21, x4\n\t" + "mul x16, x21, x4\n\t" + /* A[2] * B[0] */ + "umulh x20, x23, x4\n\t" + "mul x19, x23, x4\n\t" + /* A[1] * B[0] */ + "mul x25, x22, x4\n\t" "adds x17, x17, x25\n\t" + "umulh x26, x22, x4\n\t" "adcs x19, x19, x26\n\t" - "adc x20, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x25, x7, x23\n\t" - "umulh x26, x7, x23\n\t" + "adc x20, x20, xzr\n\t" + /* A[1] * B[3] */ + "umulh x9, x22, x7\n\t" + "mul x8, x22, x7\n\t" + /* A[0] * B[1] */ + "mul x25, x21, x5\n\t" "adds x17, x17, x25\n\t" + "umulh x26, x21, x5\n\t" "adcs x19, x19, x26\n\t" - "adc x20, x20, xzr\n\t" - /* A[3] * B[3] */ - "mul x25, x7, x24\n\t" - "umulh x26, x7, x24\n\t" + /* A[2] * B[1] */ + "mul x25, x23, x5\n\t" + "adcs x20, x20, x25\n\t" + "umulh x26, x23, x5\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x22, x6\n\t" + "adds x20, x20, x25\n\t" + "umulh x26, x22, x6\n\t" + "adcs x8, x8, x26\n\t" + "adcs x9, x9, xzr\n\t" + "adc x10, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x21, x6\n\t" "adds x19, x19, x25\n\t" - "adc x20, x20, x26\n\t" + "umulh x26, x21, x6\n\t" + "adcs x20, x20, x26\n\t" + "adcs x8, x8, xzr\n\t" + "adcs x9, x9, xzr\n\t" + "adc x10, x10, xzr\n\t" + /* A[1] * B[1] */ + "mul x25, x22, x5\n\t" + "adds x19, x19, x25\n\t" + "umulh x26, x22, x5\n\t" + "adcs x20, x20, x26\n\t" + /* A[3] * B[1] */ + "mul x25, x24, x5\n\t" + "adcs x8, x8, x25\n\t" + "umulh x26, x24, x5\n\t" + "adcs x9, x9, x26\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x23, x6\n\t" + "adds x8, x8, x25\n\t" + "umulh x26, x23, x6\n\t" + "adcs x9, x9, x26\n\t" + /* A[3] * B[3] */ + "mul x25, x24, x7\n\t" + "adcs x10, x10, x25\n\t" + "umulh x11, x24, x7\n\t" + "adc x11, x11, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x21, x7\n\t" + "adds x20, x20, x25\n\t" + "umulh x26, x21, x7\n\t" + "adcs x8, x8, x26\n\t" + /* A[2] * B[3] */ + "mul x25, x23, x7\n\t" + "adcs x9, x9, x25\n\t" + "umulh x26, x23, x7\n\t" + "adcs x10, x10, x26\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x24, x4\n\t" + "adds x20, x20, x25\n\t" + "umulh x26, x24, x4\n\t" + "adcs x8, x8, x26\n\t" + /* A[3] * B[2] */ + "mul x25, x24, x6\n\t" + "adcs x9, x9, x25\n\t" + "umulh x26, x24, x6\n\t" + "adcs x10, x10, x26\n\t" + "adc x11, x11, xzr\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x20, x20, x19, #63\n\t" - "extr x19, x19, x17, #63\n\t" - "extr x17, x17, x16, #63\n\t" - "extr x16, x16, x15, #63\n\t" - "and x15, x15, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x25, #19\n\t" - "mul x26, x25, x16\n\t" - "umulh x16, x25, x16\n\t" - "adds x12, x12, x26\n\t" - "mul x26, x25, x17\n\t" - "umulh x17, x25, x17\n\t" - "adcs 
x13, x13, x26\n\t" - "mul x26, x25, x19\n\t" - "umulh x19, x25, x19\n\t" - "adcs x14, x14, x26\n\t" - "mul x26, x25, x20\n\t" - "umulh x27, x25, x20\n\t" - "adcs x15, x15, x26\n\t" - "adc x27, x27, xzr\n\t" - /* Add remaining product results in */ - "adds x13, x13, x16\n\t" - "adcs x14, x14, x17\n\t" - "adcs x15, x15, x19\n\t" + "mov x25, #38\n\t" + "mul x26, x25, x11\n\t" + "adds x20, x20, x26\n\t" + "umulh x27, x25, x11\n\t" "adc x27, x27, xzr\n\t" - /* Overflow */ - "extr x27, x27, x15, #63\n\t" + "mov x25, #19\n\t" + "extr x27, x27, x20, #63\n\t" "mul x27, x27, x25\n\t" - "and x15, x15, #0x7fffffffffffffff\n\t" - "adds x12, x12, x27\n\t" - "adcs x13, x13, xzr\n\t" - "adcs x14, x14, xzr\n\t" - "adc x15, x15, xzr\n\t" - /* Reduce if top bit set */ - "and x27, x25, x15, asr 63\n\t" - "and x15, x15, #0x7fffffffffffffff\n\t" - "adds x12, x12, x27\n\t" - "adcs x13, x13, xzr\n\t" - "adcs x14, x14, xzr\n\t" - "adc x15, x15, xzr\n\t" + "and x20, x20, #0x7fffffffffffffff\n\t" + "mov x25, #38\n\t" + "mul x26, x25, x8\n\t" + "adds x16, x16, x26\n\t" + "umulh x8, x25, x8\n\t" + "mul x26, x25, x9\n\t" + "adcs x17, x17, x26\n\t" + "umulh x9, x25, x9\n\t" + "mul x26, x25, x10\n\t" + "adcs x19, x19, x26\n\t" + "umulh x10, x25, x10\n\t" + "adc x20, x20, xzr\n\t" + /* Add high product results in */ + "adds x16, x16, x27\n\t" + "adcs x17, x17, x8\n\t" + "adcs x19, x19, x9\n\t" + "adc x20, x20, x10\n\t" /* Store */ - "ldr x0, [x29, #24]\n\t" - "ldr x1, [sp, #112]\n\t" + "stp x16, x17, [x0]\n\t" + "stp x19, x20, [x0, #16]\n\t" + "sub x3, x2, #32\n\t" + "sub x2, x1, #32\n\t" + "sub x1, x0, #32\n\t" /* Multiply */ - "ldp x21, x22, [x1]\n\t" - "ldp x23, x24, [x1, #16]\n\t" - /* A[0] * B[0] */ - "mul x4, x8, x21\n\t" - "umulh x5, x8, x21\n\t" - /* A[0] * B[1] */ - "mul x25, x8, x22\n\t" - "umulh x6, x8, x22\n\t" - "adds x5, x5, x25\n\t" - "adc x6, x6, xzr\n\t" - /* A[1] * B[0] */ - "mul x25, x9, x21\n\t" - "umulh x26, x9, x21\n\t" - "adds x5, x5, x25\n\t" - "adcs x6, x6, x26\n\t" - "adc x7, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x25, x8, x23\n\t" - "umulh x26, x8, x23\n\t" - "adds x6, x6, x25\n\t" - "adc x7, x7, x26\n\t" - /* A[1] * B[1] */ - "mul x25, x9, x22\n\t" - "umulh x26, x9, x22\n\t" - "adds x6, x6, x25\n\t" - "adcs x7, x7, x26\n\t" - "adc x16, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x25, x10, x21\n\t" - "umulh x26, x10, x21\n\t" - "adds x6, x6, x25\n\t" - "adcs x7, x7, x26\n\t" - "adc x16, x16, xzr\n\t" - /* A[0] * B[3] */ - "mul x25, x8, x24\n\t" - "umulh x26, x8, x24\n\t" - "adds x7, x7, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x25, x9, x23\n\t" - "umulh x26, x9, x23\n\t" - "adds x7, x7, x25\n\t" - "adcs x16, x16, x26\n\t" - "adc x17, x17, xzr\n\t" - /* A[2] * B[1] */ - "mul x25, x10, x22\n\t" - "umulh x26, x10, x22\n\t" - "adds x7, x7, x25\n\t" + "ldp x4, x5, [x2]\n\t" + "ldp x6, x7, [x2, #16]\n\t" + "ldp x12, x13, [x3]\n\t" + "ldp x14, x15, [x3, #16]\n\t" + /* A[0] * B[0] */ + "umulh x9, x4, x12\n\t" + "mul x8, x4, x12\n\t" + /* A[2] * B[0] */ + "umulh x11, x6, x12\n\t" + "mul x10, x6, x12\n\t" + /* A[1] * B[0] */ + "mul x25, x5, x12\n\t" + "adds x9, x9, x25\n\t" + "umulh x26, x5, x12\n\t" + "adcs x10, x10, x26\n\t" + "adc x11, x11, xzr\n\t" + /* A[1] * B[3] */ + "umulh x17, x5, x15\n\t" + "mul x16, x5, x15\n\t" + /* A[0] * B[1] */ + "mul x25, x4, x13\n\t" + "adds x9, x9, x25\n\t" + "umulh x26, x4, x13\n\t" + "adcs x10, x10, x26\n\t" + /* A[2] * B[1] */ + "mul x25, x6, x13\n\t" + "adcs x11, x11, x25\n\t" + "umulh x26, x6, x13\n\t" "adcs x16, x16, 
x26\n\t" "adc x17, x17, xzr\n\t" - /* A[3] * B[0] */ - "mul x25, x11, x21\n\t" - "umulh x26, x11, x21\n\t" - "adds x7, x7, x25\n\t" + /* A[1] * B[2] */ + "mul x25, x5, x14\n\t" + "adds x11, x11, x25\n\t" + "umulh x26, x5, x14\n\t" "adcs x16, x16, x26\n\t" - "adc x17, x17, xzr\n\t" - /* A[1] * B[3] */ - "mul x25, x9, x24\n\t" - "umulh x26, x9, x24\n\t" - "adds x16, x16, x25\n\t" - "adcs x17, x17, x26\n\t" + "adcs x17, x17, xzr\n\t" "adc x19, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x25, x10, x23\n\t" - "umulh x26, x10, x23\n\t" - "adds x16, x16, x25\n\t" + /* A[0] * B[2] */ + "mul x25, x4, x14\n\t" + "adds x10, x10, x25\n\t" + "umulh x26, x4, x14\n\t" + "adcs x11, x11, x26\n\t" + "adcs x16, x16, xzr\n\t" + "adcs x17, x17, xzr\n\t" + "adc x19, x19, xzr\n\t" + /* A[1] * B[1] */ + "mul x25, x5, x13\n\t" + "adds x10, x10, x25\n\t" + "umulh x26, x5, x13\n\t" + "adcs x11, x11, x26\n\t" + /* A[3] * B[1] */ + "mul x25, x7, x13\n\t" + "adcs x16, x16, x25\n\t" + "umulh x26, x7, x13\n\t" "adcs x17, x17, x26\n\t" "adc x19, x19, xzr\n\t" - /* A[3] * B[1] */ - "mul x25, x11, x22\n\t" - "umulh x26, x11, x22\n\t" + /* A[2] * B[2] */ + "mul x25, x6, x14\n\t" "adds x16, x16, x25\n\t" + "umulh x26, x6, x14\n\t" "adcs x17, x17, x26\n\t" - "adc x19, x19, xzr\n\t" - /* A[2] * B[3] */ - "mul x25, x10, x24\n\t" - "umulh x26, x10, x24\n\t" - "adds x17, x17, x25\n\t" + /* A[3] * B[3] */ + "mul x25, x7, x15\n\t" + "adcs x19, x19, x25\n\t" + "umulh x20, x7, x15\n\t" + "adc x20, x20, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x4, x15\n\t" + "adds x11, x11, x25\n\t" + "umulh x26, x4, x15\n\t" + "adcs x16, x16, x26\n\t" + /* A[2] * B[3] */ + "mul x25, x6, x15\n\t" + "adcs x17, x17, x25\n\t" + "umulh x26, x6, x15\n\t" "adcs x19, x19, x26\n\t" - "adc x20, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x25, x11, x23\n\t" - "umulh x26, x11, x23\n\t" - "adds x17, x17, x25\n\t" + "adc x20, x20, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x7, x12\n\t" + "adds x11, x11, x25\n\t" + "umulh x26, x7, x12\n\t" + "adcs x16, x16, x26\n\t" + /* A[3] * B[2] */ + "mul x25, x7, x14\n\t" + "adcs x17, x17, x25\n\t" + "umulh x26, x7, x14\n\t" "adcs x19, x19, x26\n\t" "adc x20, x20, xzr\n\t" - /* A[3] * B[3] */ - "mul x25, x11, x24\n\t" - "umulh x26, x11, x24\n\t" - "adds x19, x19, x25\n\t" - "adc x20, x20, x26\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x20, x20, x19, #63\n\t" - "extr x19, x19, x17, #63\n\t" - "extr x17, x17, x16, #63\n\t" - "extr x16, x16, x7, #63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ + "mov x25, #38\n\t" + "mul x26, x25, x20\n\t" + "adds x11, x11, x26\n\t" + "umulh x27, x25, x20\n\t" + "adc x27, x27, xzr\n\t" "mov x25, #19\n\t" + "extr x27, x27, x11, #63\n\t" + "mul x27, x27, x25\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + "mov x25, #38\n\t" "mul x26, x25, x16\n\t" + "adds x8, x8, x26\n\t" "umulh x16, x25, x16\n\t" - "adds x4, x4, x26\n\t" "mul x26, x25, x17\n\t" + "adcs x9, x9, x26\n\t" "umulh x17, x25, x17\n\t" - "adcs x5, x5, x26\n\t" "mul x26, x25, x19\n\t" + "adcs x10, x10, x26\n\t" "umulh x19, x25, x19\n\t" - "adcs x6, x6, x26\n\t" - "mul x26, x25, x20\n\t" - "umulh x27, x25, x20\n\t" - "adcs x7, x7, x26\n\t" - "adc x27, x27, xzr\n\t" - /* Add remaining product results in */ - "adds x5, x5, x16\n\t" - "adcs x6, x6, x17\n\t" - "adcs x7, x7, x19\n\t" - "adc x27, x27, xzr\n\t" - /* Overflow */ - "extr x27, x27, x7, #63\n\t" - "mul x27, x27, x25\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x27\n\t" - "adcs x5, x5, xzr\n\t" - "adcs 
x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" - /* Reduce if top bit set */ - "and x27, x25, x7, asr 63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x27\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" - /* Store */ - "ldr x0, [x29, #24]\n\t" - "ldr x1, [x29, #16]\n\t" - /* Add */ - "adds x8, x12, x4\n\t" - "adcs x9, x13, x5\n\t" - "adcs x10, x14, x6\n\t" - "adc x11, x15, x7\n\t" + "adc x11, x11, xzr\n\t" + /* Add high product results in */ + "adds x8, x8, x27\n\t" + "adcs x9, x9, x16\n\t" + "adcs x10, x10, x17\n\t" + "adc x11, x11, x19\n\t" + /* Double */ + "adds x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adc x11, x11, x11\n\t" "mov x25, #-19\n\t" "asr x28, x11, #63\n\t" /* Mask the modulus */ @@ -6868,366 +7378,1155 @@ void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz "sbcs x9, x9, x28\n\t" "sbcs x10, x10, x28\n\t" "sbc x11, x11, x26\n\t" + "mov x3, x0\n\t" + "sub x2, x0, #32\n\t" + "mov x1, x0\n\t" + "sub x0, x0, #32\n\t" + /* Add */ + "ldp x4, x5, [x3]\n\t" + "ldp x6, x7, [x3, #16]\n\t" + "adds x21, x8, x4\n\t" + "adcs x22, x9, x5\n\t" + "adcs x23, x10, x6\n\t" + "adcs x24, x11, x7\n\t" + "cset x28, cs\n\t" + "mov x25, #19\n\t" + "extr x28, x28, x24, #63\n\t" + "mul x25, x28, x25\n\t" + /* Sub modulus (if overflow) */ + "adds x21, x21, x25\n\t" + "adcs x22, x22, xzr\n\t" + "and x24, x24, #0x7fffffffffffffff\n\t" + "adcs x23, x23, xzr\n\t" + "adc x24, x24, xzr\n\t" /* Sub */ - "subs x16, x12, x4\n\t" - "sbcs x17, x13, x5\n\t" - "sbcs x19, x14, x6\n\t" - "sbcs x20, x15, x7\n\t" - "mov x25, #-19\n\t" + "subs x12, x8, x4\n\t" + "sbcs x13, x9, x5\n\t" + "sbcs x14, x10, x6\n\t" + "sbcs x15, x11, x7\n\t" "csetm x28, cc\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" + "mov x25, #-19\n\t" + "extr x28, x28, x15, #63\n\t" + "mul x25, x28, x25\n\t" /* Add modulus (if underflow) */ + "subs x12, x12, x25\n\t" + "sbcs x13, x13, xzr\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "sbcs x14, x14, xzr\n\t" + "sbc x15, x15, xzr\n\t" + "stp x21, x22, [x0]\n\t" + "stp x23, x24, [x0, #16]\n\t" + "stp x12, x13, [x1]\n\t" + "stp x14, x15, [x1, #16]\n\t" + "ldp x29, x30, [sp], #48\n\t" + : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q) + : + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +void ge_sub(ge_p1p1* r, const ge_p3* p, const ge_cached* q) +{ + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-48]!\n\t" + "add x29, sp, #0\n\t" + "str %w[r], [x29, #16]\n\t" + "str %w[p], [x29, #24]\n\t" + "str %w[q], [x29, #32]\n\t" + "mov x3, x1\n\t" + "add x2, x1, #32\n\t" + "add x1, x0, #32\n\t" + /* Add */ + "ldp x8, x9, [x2]\n\t" + "ldp x10, x11, [x2, #16]\n\t" + "ldp x4, x5, [x3]\n\t" + "ldp x6, x7, [x3, #16]\n\t" + "adds x16, x8, x4\n\t" + "adcs x17, x9, x5\n\t" + "adcs x19, x10, x6\n\t" + "adcs x20, x11, x7\n\t" + "cset x28, cs\n\t" + "mov x25, #19\n\t" + "extr x28, x28, x20, #63\n\t" + "mul x25, x28, x25\n\t" + /* Sub modulus (if overflow) */ "adds x16, x16, x25\n\t" - "adcs x17, x17, x28\n\t" - "adcs x19, x19, x28\n\t" - "adc x20, x20, x26\n\t" - "stp x8, x9, [x0]\n\t" - "stp x10, x11, [x0, #16]\n\t" - "stp x16, x17, [x1]\n\t" - "stp x19, x20, [x1, #16]\n\t" - "ldr x0, [x29, #48]\n\t" - "ldr x1, [x29, #64]\n\t" - "ldr x2, [sp, #96]\n\t" + "adcs x17, x17, xzr\n\t" + "and x20, x20, #0x7fffffffffffffff\n\t" + "adcs x19, x19, xzr\n\t" 
+ "adc x20, x20, xzr\n\t" + /* Sub */ + "subs x12, x8, x4\n\t" + "sbcs x13, x9, x5\n\t" + "sbcs x14, x10, x6\n\t" + "sbcs x15, x11, x7\n\t" + "csetm x28, cc\n\t" + "mov x25, #-19\n\t" + "extr x28, x28, x15, #63\n\t" + "mul x25, x28, x25\n\t" + /* Add modulus (if underflow) */ + "subs x12, x12, x25\n\t" + "sbcs x13, x13, xzr\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "sbcs x14, x14, xzr\n\t" + "sbc x15, x15, xzr\n\t" + "ldr x2, [x29, #32]\n\t" + "add x2, x2, #32\n\t" + "mov x1, x0\n\t" + /* Multiply */ + "ldp x8, x9, [x2]\n\t" + "ldp x10, x11, [x2, #16]\n\t" + /* A[0] * B[0] */ + "umulh x22, x16, x8\n\t" + "mul x21, x16, x8\n\t" + /* A[2] * B[0] */ + "umulh x24, x19, x8\n\t" + "mul x23, x19, x8\n\t" + /* A[1] * B[0] */ + "mul x25, x17, x8\n\t" + "adds x22, x22, x25\n\t" + "umulh x26, x17, x8\n\t" + "adcs x23, x23, x26\n\t" + "adc x24, x24, xzr\n\t" + /* A[1] * B[3] */ + "umulh x5, x17, x11\n\t" + "mul x4, x17, x11\n\t" + /* A[0] * B[1] */ + "mul x25, x16, x9\n\t" + "adds x22, x22, x25\n\t" + "umulh x26, x16, x9\n\t" + "adcs x23, x23, x26\n\t" + /* A[2] * B[1] */ + "mul x25, x19, x9\n\t" + "adcs x24, x24, x25\n\t" + "umulh x26, x19, x9\n\t" + "adcs x4, x4, x26\n\t" + "adc x5, x5, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x17, x10\n\t" + "adds x24, x24, x25\n\t" + "umulh x26, x17, x10\n\t" + "adcs x4, x4, x26\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x16, x10\n\t" + "adds x23, x23, x25\n\t" + "umulh x26, x16, x10\n\t" + "adcs x24, x24, x26\n\t" + "adcs x4, x4, xzr\n\t" + "adcs x5, x5, xzr\n\t" + "adc x6, x6, xzr\n\t" + /* A[1] * B[1] */ + "mul x25, x17, x9\n\t" + "adds x23, x23, x25\n\t" + "umulh x26, x17, x9\n\t" + "adcs x24, x24, x26\n\t" + /* A[3] * B[1] */ + "mul x25, x20, x9\n\t" + "adcs x4, x4, x25\n\t" + "umulh x26, x20, x9\n\t" + "adcs x5, x5, x26\n\t" + "adc x6, x6, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x19, x10\n\t" + "adds x4, x4, x25\n\t" + "umulh x26, x19, x10\n\t" + "adcs x5, x5, x26\n\t" + /* A[3] * B[3] */ + "mul x25, x20, x11\n\t" + "adcs x6, x6, x25\n\t" + "umulh x7, x20, x11\n\t" + "adc x7, x7, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x16, x11\n\t" + "adds x24, x24, x25\n\t" + "umulh x26, x16, x11\n\t" + "adcs x4, x4, x26\n\t" + /* A[2] * B[3] */ + "mul x25, x19, x11\n\t" + "adcs x5, x5, x25\n\t" + "umulh x26, x19, x11\n\t" + "adcs x6, x6, x26\n\t" + "adc x7, x7, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x20, x8\n\t" + "adds x24, x24, x25\n\t" + "umulh x26, x20, x8\n\t" + "adcs x4, x4, x26\n\t" + /* A[3] * B[2] */ + "mul x25, x20, x10\n\t" + "adcs x5, x5, x25\n\t" + "umulh x26, x20, x10\n\t" + "adcs x6, x6, x26\n\t" + "adc x7, x7, xzr\n\t" + /* Reduce */ + "mov x25, #38\n\t" + "mul x26, x25, x7\n\t" + "adds x24, x24, x26\n\t" + "umulh x27, x25, x7\n\t" + "adc x27, x27, xzr\n\t" + "mov x25, #19\n\t" + "extr x27, x27, x24, #63\n\t" + "mul x27, x27, x25\n\t" + "and x24, x24, #0x7fffffffffffffff\n\t" + "mov x25, #38\n\t" + "mul x26, x25, x4\n\t" + "adds x21, x21, x26\n\t" + "umulh x4, x25, x4\n\t" + "mul x26, x25, x5\n\t" + "adcs x22, x22, x26\n\t" + "umulh x5, x25, x5\n\t" + "mul x26, x25, x6\n\t" + "adcs x23, x23, x26\n\t" + "umulh x6, x25, x6\n\t" + "adc x24, x24, xzr\n\t" + /* Add high product results in */ + "adds x21, x21, x27\n\t" + "adcs x22, x22, x4\n\t" + "adcs x23, x23, x5\n\t" + "adc x24, x24, x6\n\t" + /* Reduce if top bit set */ + "mov x25, #19\n\t" + "and x26, x25, x24, asr 63\n\t" + "adds x21, x21, x26\n\t" + "adcs x22, x22, xzr\n\t" + "and x24, x24, #0x7fffffffffffffff\n\t" + "adcs x23, x23, xzr\n\t" + "adc 
x24, x24, xzr\n\t" + /* Store */ + "stp x21, x22, [x0]\n\t" + "stp x23, x24, [x0, #16]\n\t" + "sub x2, x2, #32\n\t" + "add x1, x0, #32\n\t" + "add x0, x0, #32\n\t" /* Multiply */ - "ldp x12, x13, [x1]\n\t" - "ldp x14, x15, [x1, #16]\n\t" "ldp x16, x17, [x2]\n\t" "ldp x19, x20, [x2, #16]\n\t" - /* A[0] * B[0] */ - "mul x4, x12, x16\n\t" + /* A[0] * B[0] */ "umulh x5, x12, x16\n\t" - /* A[0] * B[1] */ - "mul x25, x12, x17\n\t" - "umulh x6, x12, x17\n\t" - "adds x5, x5, x25\n\t" - "adc x6, x6, xzr\n\t" - /* A[1] * B[0] */ + "mul x4, x12, x16\n\t" + /* A[2] * B[0] */ + "umulh x7, x14, x16\n\t" + "mul x6, x14, x16\n\t" + /* A[1] * B[0] */ "mul x25, x13, x16\n\t" + "adds x5, x5, x25\n\t" "umulh x26, x13, x16\n\t" + "adcs x6, x6, x26\n\t" + "adc x7, x7, xzr\n\t" + /* A[1] * B[3] */ + "umulh x9, x13, x20\n\t" + "mul x8, x13, x20\n\t" + /* A[0] * B[1] */ + "mul x25, x12, x17\n\t" "adds x5, x5, x25\n\t" + "umulh x26, x12, x17\n\t" "adcs x6, x6, x26\n\t" - "adc x7, xzr, xzr\n\t" - /* A[0] * B[2] */ + /* A[2] * B[1] */ + "mul x25, x14, x17\n\t" + "adcs x7, x7, x25\n\t" + "umulh x26, x14, x17\n\t" + "adcs x8, x8, x26\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x13, x19\n\t" + "adds x7, x7, x25\n\t" + "umulh x26, x13, x19\n\t" + "adcs x8, x8, x26\n\t" + "adcs x9, x9, xzr\n\t" + "adc x10, xzr, xzr\n\t" + /* A[0] * B[2] */ "mul x25, x12, x19\n\t" - "umulh x26, x12, x19\n\t" - "adds x6, x6, x25\n\t" - "adc x7, x7, x26\n\t" - /* A[1] * B[1] */ - "mul x25, x13, x17\n\t" - "umulh x26, x13, x17\n\t" "adds x6, x6, x25\n\t" + "umulh x26, x12, x19\n\t" "adcs x7, x7, x26\n\t" - "adc x8, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x25, x14, x16\n\t" - "umulh x26, x14, x16\n\t" + "adcs x8, x8, xzr\n\t" + "adcs x9, x9, xzr\n\t" + "adc x10, x10, xzr\n\t" + /* A[1] * B[1] */ + "mul x25, x13, x17\n\t" "adds x6, x6, x25\n\t" + "umulh x26, x13, x17\n\t" "adcs x7, x7, x26\n\t" - "adc x8, x8, xzr\n\t" - /* A[0] * B[3] */ + /* A[3] * B[1] */ + "mul x25, x15, x17\n\t" + "adcs x8, x8, x25\n\t" + "umulh x26, x15, x17\n\t" + "adcs x9, x9, x26\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x14, x19\n\t" + "adds x8, x8, x25\n\t" + "umulh x26, x14, x19\n\t" + "adcs x9, x9, x26\n\t" + /* A[3] * B[3] */ + "mul x25, x15, x20\n\t" + "adcs x10, x10, x25\n\t" + "umulh x11, x15, x20\n\t" + "adc x11, x11, xzr\n\t" + /* A[0] * B[3] */ "mul x25, x12, x20\n\t" - "umulh x26, x12, x20\n\t" "adds x7, x7, x25\n\t" + "umulh x26, x12, x20\n\t" "adcs x8, x8, x26\n\t" - "adc x9, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x25, x13, x19\n\t" - "umulh x26, x13, x19\n\t" + /* A[2] * B[3] */ + "mul x25, x14, x20\n\t" + "adcs x9, x9, x25\n\t" + "umulh x26, x14, x20\n\t" + "adcs x10, x10, x26\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x15, x16\n\t" "adds x7, x7, x25\n\t" + "umulh x26, x15, x16\n\t" "adcs x8, x8, x26\n\t" - "adc x9, x9, xzr\n\t" - /* A[2] * B[1] */ - "mul x25, x14, x17\n\t" - "umulh x26, x14, x17\n\t" - "adds x7, x7, x25\n\t" + /* A[3] * B[2] */ + "mul x25, x15, x19\n\t" + "adcs x9, x9, x25\n\t" + "umulh x26, x15, x19\n\t" + "adcs x10, x10, x26\n\t" + "adc x11, x11, xzr\n\t" + /* Reduce */ + "mov x25, #38\n\t" + "mul x26, x25, x11\n\t" + "adds x7, x7, x26\n\t" + "umulh x27, x25, x11\n\t" + "adc x27, x27, xzr\n\t" + "mov x25, #19\n\t" + "extr x27, x27, x7, #63\n\t" + "mul x27, x27, x25\n\t" + "and x7, x7, #0x7fffffffffffffff\n\t" + "mov x25, #38\n\t" + "mul x26, x25, x8\n\t" + "adds x4, x4, x26\n\t" + "umulh x8, x25, x8\n\t" + "mul x26, x25, x9\n\t" + "adcs x5, x5, x26\n\t" + "umulh x9, x25, 
x9\n\t" + "mul x26, x25, x10\n\t" + "adcs x6, x6, x26\n\t" + "umulh x10, x25, x10\n\t" + "adc x7, x7, xzr\n\t" + /* Add high product results in */ + "adds x4, x4, x27\n\t" + "adcs x5, x5, x8\n\t" + "adcs x6, x6, x9\n\t" + "adc x7, x7, x10\n\t" + /* Store */ + "stp x4, x5, [x0]\n\t" + "stp x6, x7, [x0, #16]\n\t" + "mov x3, x0\n\t" + "sub x2, x0, #32\n\t" + "sub x1, x0, #32\n\t" + /* Add */ + "adds x8, x21, x4\n\t" + "adcs x9, x22, x5\n\t" + "adcs x10, x23, x6\n\t" + "adcs x11, x24, x7\n\t" + "cset x28, cs\n\t" + "mov x25, #19\n\t" + "extr x28, x28, x11, #63\n\t" + "mul x25, x28, x25\n\t" + /* Sub modulus (if overflow) */ + "adds x8, x8, x25\n\t" + "adcs x9, x9, xzr\n\t" + "and x11, x11, #0x7fffffffffffffff\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, x11, xzr\n\t" + /* Sub */ + "subs x12, x21, x4\n\t" + "sbcs x13, x22, x5\n\t" + "sbcs x14, x23, x6\n\t" + "sbcs x15, x24, x7\n\t" + "csetm x28, cc\n\t" + "mov x25, #-19\n\t" + "extr x28, x28, x15, #63\n\t" + "mul x25, x28, x25\n\t" + /* Add modulus (if underflow) */ + "subs x12, x12, x25\n\t" + "sbcs x13, x13, xzr\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "sbcs x14, x14, xzr\n\t" + "sbc x15, x15, xzr\n\t" + "stp x8, x9, [x0]\n\t" + "stp x10, x11, [x0, #16]\n\t" + "stp x12, x13, [x1]\n\t" + "stp x14, x15, [x1, #16]\n\t" + "ldr x1, [x29, #24]\n\t" + "ldr x2, [x29, #32]\n\t" + "add x2, x2, #0x60\n\t" + "add x1, x1, #0x60\n\t" + "add x0, x0, #0x40\n\t" + /* Multiply */ + "ldp x21, x22, [x1]\n\t" + "ldp x23, x24, [x1, #16]\n\t" + "ldp x4, x5, [x2]\n\t" + "ldp x6, x7, [x2, #16]\n\t" + /* A[0] * B[0] */ + "umulh x17, x21, x4\n\t" + "mul x16, x21, x4\n\t" + /* A[2] * B[0] */ + "umulh x20, x23, x4\n\t" + "mul x19, x23, x4\n\t" + /* A[1] * B[0] */ + "mul x25, x22, x4\n\t" + "adds x17, x17, x25\n\t" + "umulh x26, x22, x4\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, x20, xzr\n\t" + /* A[1] * B[3] */ + "umulh x9, x22, x7\n\t" + "mul x8, x22, x7\n\t" + /* A[0] * B[1] */ + "mul x25, x21, x5\n\t" + "adds x17, x17, x25\n\t" + "umulh x26, x21, x5\n\t" + "adcs x19, x19, x26\n\t" + /* A[2] * B[1] */ + "mul x25, x23, x5\n\t" + "adcs x20, x20, x25\n\t" + "umulh x26, x23, x5\n\t" "adcs x8, x8, x26\n\t" "adc x9, x9, xzr\n\t" - /* A[3] * B[0] */ - "mul x25, x15, x16\n\t" - "umulh x26, x15, x16\n\t" - "adds x7, x7, x25\n\t" + /* A[1] * B[2] */ + "mul x25, x22, x6\n\t" + "adds x20, x20, x25\n\t" + "umulh x26, x22, x6\n\t" "adcs x8, x8, x26\n\t" - "adc x9, x9, xzr\n\t" - /* A[1] * B[3] */ - "mul x25, x13, x20\n\t" - "umulh x26, x13, x20\n\t" - "adds x8, x8, x25\n\t" - "adcs x9, x9, x26\n\t" + "adcs x9, x9, xzr\n\t" "adc x10, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x25, x14, x19\n\t" - "umulh x26, x14, x19\n\t" - "adds x8, x8, x25\n\t" + /* A[0] * B[2] */ + "mul x25, x21, x6\n\t" + "adds x19, x19, x25\n\t" + "umulh x26, x21, x6\n\t" + "adcs x20, x20, x26\n\t" + "adcs x8, x8, xzr\n\t" + "adcs x9, x9, xzr\n\t" + "adc x10, x10, xzr\n\t" + /* A[1] * B[1] */ + "mul x25, x22, x5\n\t" + "adds x19, x19, x25\n\t" + "umulh x26, x22, x5\n\t" + "adcs x20, x20, x26\n\t" + /* A[3] * B[1] */ + "mul x25, x24, x5\n\t" + "adcs x8, x8, x25\n\t" + "umulh x26, x24, x5\n\t" "adcs x9, x9, x26\n\t" "adc x10, x10, xzr\n\t" - /* A[3] * B[1] */ - "mul x25, x15, x17\n\t" - "umulh x26, x15, x17\n\t" + /* A[2] * B[2] */ + "mul x25, x23, x6\n\t" "adds x8, x8, x25\n\t" + "umulh x26, x23, x6\n\t" "adcs x9, x9, x26\n\t" - "adc x10, x10, xzr\n\t" - /* A[2] * B[3] */ - "mul x25, x14, x20\n\t" - "umulh x26, x14, x20\n\t" - "adds x9, x9, x25\n\t" + /* A[3] * B[3] */ + "mul x25, x24, x7\n\t" + "adcs x10, 
x10, x25\n\t" + "umulh x11, x24, x7\n\t" + "adc x11, x11, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x21, x7\n\t" + "adds x20, x20, x25\n\t" + "umulh x26, x21, x7\n\t" + "adcs x8, x8, x26\n\t" + /* A[2] * B[3] */ + "mul x25, x23, x7\n\t" + "adcs x9, x9, x25\n\t" + "umulh x26, x23, x7\n\t" "adcs x10, x10, x26\n\t" - "adc x11, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x25, x15, x19\n\t" - "umulh x26, x15, x19\n\t" - "adds x9, x9, x25\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x24, x4\n\t" + "adds x20, x20, x25\n\t" + "umulh x26, x24, x4\n\t" + "adcs x8, x8, x26\n\t" + /* A[3] * B[2] */ + "mul x25, x24, x6\n\t" + "adcs x9, x9, x25\n\t" + "umulh x26, x24, x6\n\t" "adcs x10, x10, x26\n\t" "adc x11, x11, xzr\n\t" - /* A[3] * B[3] */ - "mul x25, x15, x20\n\t" - "umulh x26, x15, x20\n\t" - "adds x10, x10, x25\n\t" - "adc x11, x11, x26\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x11, x11, x10, #63\n\t" - "extr x10, x10, x9, #63\n\t" - "extr x9, x9, x8, #63\n\t" - "extr x8, x8, x7, #63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ + "mov x25, #38\n\t" + "mul x26, x25, x11\n\t" + "adds x20, x20, x26\n\t" + "umulh x27, x25, x11\n\t" + "adc x27, x27, xzr\n\t" "mov x25, #19\n\t" + "extr x27, x27, x20, #63\n\t" + "mul x27, x27, x25\n\t" + "and x20, x20, #0x7fffffffffffffff\n\t" + "mov x25, #38\n\t" "mul x26, x25, x8\n\t" + "adds x16, x16, x26\n\t" "umulh x8, x25, x8\n\t" - "adds x4, x4, x26\n\t" "mul x26, x25, x9\n\t" + "adcs x17, x17, x26\n\t" "umulh x9, x25, x9\n\t" - "adcs x5, x5, x26\n\t" "mul x26, x25, x10\n\t" + "adcs x19, x19, x26\n\t" "umulh x10, x25, x10\n\t" - "adcs x6, x6, x26\n\t" - "mul x26, x25, x11\n\t" - "umulh x27, x25, x11\n\t" - "adcs x7, x7, x26\n\t" - "adc x27, x27, xzr\n\t" - /* Add remaining product results in */ - "adds x5, x5, x8\n\t" - "adcs x6, x6, x9\n\t" - "adcs x7, x7, x10\n\t" - "adc x27, x27, xzr\n\t" - /* Overflow */ - "extr x27, x27, x7, #63\n\t" - "mul x27, x27, x25\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x27\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" + "adc x20, x20, xzr\n\t" + /* Add high product results in */ + "adds x16, x16, x27\n\t" + "adcs x17, x17, x8\n\t" + "adcs x19, x19, x9\n\t" + "adc x20, x20, x10\n\t" /* Reduce if top bit set */ - "and x27, x25, x7, asr 63\n\t" - "and x7, x7, #0x7fffffffffffffff\n\t" - "adds x4, x4, x27\n\t" - "adcs x5, x5, xzr\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, xzr\n\t" + "mov x25, #19\n\t" + "and x26, x25, x20, asr 63\n\t" + "adds x16, x16, x26\n\t" + "adcs x17, x17, xzr\n\t" + "and x20, x20, #0x7fffffffffffffff\n\t" + "adcs x19, x19, xzr\n\t" + "adc x20, x20, xzr\n\t" /* Store */ - "ldr x0, [x29, #48]\n\t" - /* Double */ - "adds x4, x4, x4\n\t" - "adcs x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adc x7, x7, x7\n\t" - "mov x25, #-19\n\t" - "asr x28, x7, #63\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" - /* Sub modulus (if overflow) */ - "subs x4, x4, x25\n\t" - "sbcs x5, x5, x28\n\t" - "sbcs x6, x6, x28\n\t" - "sbc x7, x7, x26\n\t" - "ldr x0, [x29, #40]\n\t" - "ldr x1, [sp, #104]\n\t" - "ldr x2, [x29, #72]\n\t" + "stp x16, x17, [x0]\n\t" + "stp x19, x20, [x0, #16]\n\t" + "sub x3, x2, #32\n\t" + "sub x2, x1, #32\n\t" + "sub x1, x0, #32\n\t" /* Multiply */ - "ldp x16, x17, [x1]\n\t" - "ldp x19, x20, [x1, #16]\n\t" - "ldp x21, x22, [x2]\n\t" - "ldp x23, x24, [x2, #16]\n\t" - /* A[0] * B[0] */ - "mul x8, x16, x21\n\t" - "umulh x9, x16, 
x21\n\t" - /* A[0] * B[1] */ - "mul x25, x16, x22\n\t" - "umulh x10, x16, x22\n\t" + "ldp x4, x5, [x2]\n\t" + "ldp x6, x7, [x2, #16]\n\t" + "ldp x12, x13, [x3]\n\t" + "ldp x14, x15, [x3, #16]\n\t" + /* A[0] * B[0] */ + "umulh x9, x4, x12\n\t" + "mul x8, x4, x12\n\t" + /* A[2] * B[0] */ + "umulh x11, x6, x12\n\t" + "mul x10, x6, x12\n\t" + /* A[1] * B[0] */ + "mul x25, x5, x12\n\t" "adds x9, x9, x25\n\t" - "adc x10, x10, xzr\n\t" - /* A[1] * B[0] */ - "mul x25, x17, x21\n\t" - "umulh x26, x17, x21\n\t" + "umulh x26, x5, x12\n\t" + "adcs x10, x10, x26\n\t" + "adc x11, x11, xzr\n\t" + /* A[1] * B[3] */ + "umulh x17, x5, x15\n\t" + "mul x16, x5, x15\n\t" + /* A[0] * B[1] */ + "mul x25, x4, x13\n\t" "adds x9, x9, x25\n\t" + "umulh x26, x4, x13\n\t" "adcs x10, x10, x26\n\t" - "adc x11, xzr, xzr\n\t" - /* A[0] * B[2] */ - "mul x25, x16, x23\n\t" - "umulh x26, x16, x23\n\t" - "adds x10, x10, x25\n\t" - "adc x11, x11, x26\n\t" - /* A[1] * B[1] */ - "mul x25, x17, x22\n\t" - "umulh x26, x17, x22\n\t" + /* A[2] * B[1] */ + "mul x25, x6, x13\n\t" + "adcs x11, x11, x25\n\t" + "umulh x26, x6, x13\n\t" + "adcs x16, x16, x26\n\t" + "adc x17, x17, xzr\n\t" + /* A[1] * B[2] */ + "mul x25, x5, x14\n\t" + "adds x11, x11, x25\n\t" + "umulh x26, x5, x14\n\t" + "adcs x16, x16, x26\n\t" + "adcs x17, x17, xzr\n\t" + "adc x19, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x25, x4, x14\n\t" "adds x10, x10, x25\n\t" + "umulh x26, x4, x14\n\t" "adcs x11, x11, x26\n\t" - "adc x12, xzr, xzr\n\t" - /* A[2] * B[0] */ - "mul x25, x19, x21\n\t" - "umulh x26, x19, x21\n\t" + "adcs x16, x16, xzr\n\t" + "adcs x17, x17, xzr\n\t" + "adc x19, x19, xzr\n\t" + /* A[1] * B[1] */ + "mul x25, x5, x13\n\t" "adds x10, x10, x25\n\t" + "umulh x26, x5, x13\n\t" "adcs x11, x11, x26\n\t" - "adc x12, x12, xzr\n\t" - /* A[0] * B[3] */ - "mul x25, x16, x24\n\t" - "umulh x26, x16, x24\n\t" - "adds x11, x11, x25\n\t" - "adcs x12, x12, x26\n\t" - "adc x13, xzr, xzr\n\t" - /* A[1] * B[2] */ - "mul x25, x17, x23\n\t" - "umulh x26, x17, x23\n\t" - "adds x11, x11, x25\n\t" - "adcs x12, x12, x26\n\t" - "adc x13, x13, xzr\n\t" - /* A[2] * B[1] */ - "mul x25, x19, x22\n\t" - "umulh x26, x19, x22\n\t" + /* A[3] * B[1] */ + "mul x25, x7, x13\n\t" + "adcs x16, x16, x25\n\t" + "umulh x26, x7, x13\n\t" + "adcs x17, x17, x26\n\t" + "adc x19, x19, xzr\n\t" + /* A[2] * B[2] */ + "mul x25, x6, x14\n\t" + "adds x16, x16, x25\n\t" + "umulh x26, x6, x14\n\t" + "adcs x17, x17, x26\n\t" + /* A[3] * B[3] */ + "mul x25, x7, x15\n\t" + "adcs x19, x19, x25\n\t" + "umulh x20, x7, x15\n\t" + "adc x20, x20, xzr\n\t" + /* A[0] * B[3] */ + "mul x25, x4, x15\n\t" "adds x11, x11, x25\n\t" - "adcs x12, x12, x26\n\t" - "adc x13, x13, xzr\n\t" - /* A[3] * B[0] */ - "mul x25, x20, x21\n\t" - "umulh x26, x20, x21\n\t" + "umulh x26, x4, x15\n\t" + "adcs x16, x16, x26\n\t" + /* A[2] * B[3] */ + "mul x25, x6, x15\n\t" + "adcs x17, x17, x25\n\t" + "umulh x26, x6, x15\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, x20, xzr\n\t" + /* A[3] * B[0] */ + "mul x25, x7, x12\n\t" "adds x11, x11, x25\n\t" - "adcs x12, x12, x26\n\t" - "adc x13, x13, xzr\n\t" - /* A[1] * B[3] */ - "mul x25, x17, x24\n\t" - "umulh x26, x17, x24\n\t" - "adds x12, x12, x25\n\t" - "adcs x13, x13, x26\n\t" - "adc x14, xzr, xzr\n\t" - /* A[2] * B[2] */ - "mul x25, x19, x23\n\t" - "umulh x26, x19, x23\n\t" - "adds x12, x12, x25\n\t" - "adcs x13, x13, x26\n\t" - "adc x14, x14, xzr\n\t" - /* A[3] * B[1] */ - "mul x25, x20, x22\n\t" - "umulh x26, x20, x22\n\t" - "adds x12, x12, x25\n\t" - "adcs x13, x13, x26\n\t" - "adc x14, x14, 
xzr\n\t" - /* A[2] * B[3] */ - "mul x25, x19, x24\n\t" - "umulh x26, x19, x24\n\t" - "adds x13, x13, x25\n\t" - "adcs x14, x14, x26\n\t" - "adc x15, xzr, xzr\n\t" - /* A[3] * B[2] */ - "mul x25, x20, x23\n\t" - "umulh x26, x20, x23\n\t" - "adds x13, x13, x25\n\t" - "adcs x14, x14, x26\n\t" - "adc x15, x15, xzr\n\t" - /* A[3] * B[3] */ - "mul x25, x20, x24\n\t" - "umulh x26, x20, x24\n\t" - "adds x14, x14, x25\n\t" - "adc x15, x15, x26\n\t" + "umulh x26, x7, x12\n\t" + "adcs x16, x16, x26\n\t" + /* A[3] * B[2] */ + "mul x25, x7, x14\n\t" + "adcs x17, x17, x25\n\t" + "umulh x26, x7, x14\n\t" + "adcs x19, x19, x26\n\t" + "adc x20, x20, xzr\n\t" /* Reduce */ - /* Move top half into t4-t7 and remove top bit from t3 */ - "extr x15, x15, x14, #63\n\t" - "extr x14, x14, x13, #63\n\t" - "extr x13, x13, x12, #63\n\t" - "extr x12, x12, x11, #63\n\t" - "and x11, x11, #0x7fffffffffffffff\n\t" - /* Multiply top half by 19 */ - "mov x25, #19\n\t" - "mul x26, x25, x12\n\t" - "umulh x12, x25, x12\n\t" - "adds x8, x8, x26\n\t" - "mul x26, x25, x13\n\t" - "umulh x13, x25, x13\n\t" - "adcs x9, x9, x26\n\t" - "mul x26, x25, x14\n\t" - "umulh x14, x25, x14\n\t" - "adcs x10, x10, x26\n\t" - "mul x26, x25, x15\n\t" - "umulh x27, x25, x15\n\t" - "adcs x11, x11, x26\n\t" - "adc x27, x27, xzr\n\t" - /* Add remaining product results in */ - "adds x9, x9, x12\n\t" - "adcs x10, x10, x13\n\t" - "adcs x11, x11, x14\n\t" + "mov x25, #38\n\t" + "mul x26, x25, x20\n\t" + "adds x11, x11, x26\n\t" + "umulh x27, x25, x20\n\t" "adc x27, x27, xzr\n\t" - /* Overflow */ + "mov x25, #19\n\t" "extr x27, x27, x11, #63\n\t" "mul x27, x27, x25\n\t" "and x11, x11, #0x7fffffffffffffff\n\t" - "adds x8, x8, x27\n\t" - "adcs x9, x9, xzr\n\t" - "adcs x10, x10, xzr\n\t" + "mov x25, #38\n\t" + "mul x26, x25, x16\n\t" + "adds x8, x8, x26\n\t" + "umulh x16, x25, x16\n\t" + "mul x26, x25, x17\n\t" + "adcs x9, x9, x26\n\t" + "umulh x17, x25, x17\n\t" + "mul x26, x25, x19\n\t" + "adcs x10, x10, x26\n\t" + "umulh x19, x25, x19\n\t" "adc x11, x11, xzr\n\t" - /* Reduce if top bit set */ - "and x27, x25, x11, asr 63\n\t" - "and x11, x11, #0x7fffffffffffffff\n\t" + /* Add high product results in */ "adds x8, x8, x27\n\t" - "adcs x9, x9, xzr\n\t" - "adcs x10, x10, xzr\n\t" - "adc x11, x11, xzr\n\t" - /* Store */ - "ldr x0, [x29, #40]\n\t" - "ldr x1, [x29, #32]\n\t" - /* Add */ - "adds x12, x4, x8\n\t" - "adcs x13, x5, x9\n\t" - "adcs x14, x6, x10\n\t" - "adc x15, x7, x11\n\t" + "adcs x9, x9, x16\n\t" + "adcs x10, x10, x17\n\t" + "adc x11, x11, x19\n\t" + /* Double */ + "adds x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adc x11, x11, x11\n\t" "mov x25, #-19\n\t" - "asr x28, x15, #63\n\t" + "asr x28, x11, #63\n\t" /* Mask the modulus */ "and x25, x28, x25\n\t" "and x26, x28, #0x7fffffffffffffff\n\t" /* Sub modulus (if overflow) */ - "subs x12, x12, x25\n\t" - "sbcs x13, x13, x28\n\t" - "sbcs x14, x14, x28\n\t" - "sbc x15, x15, x26\n\t" + "subs x8, x8, x25\n\t" + "sbcs x9, x9, x28\n\t" + "sbcs x10, x10, x28\n\t" + "sbc x11, x11, x26\n\t" + "mov x3, x0\n\t" + "sub x2, x0, #32\n\t" + /* Add */ + "ldp x4, x5, [x3]\n\t" + "ldp x6, x7, [x3, #16]\n\t" + "adds x12, x8, x4\n\t" + "adcs x13, x9, x5\n\t" + "adcs x14, x10, x6\n\t" + "adcs x15, x11, x7\n\t" + "cset x28, cs\n\t" + "mov x25, #19\n\t" + "extr x28, x28, x15, #63\n\t" + "mul x25, x28, x25\n\t" + /* Sub modulus (if overflow) */ + "adds x12, x12, x25\n\t" + "adcs x13, x13, xzr\n\t" + "and x15, x15, #0x7fffffffffffffff\n\t" + "adcs x14, x14, xzr\n\t" + "adc x15, x15, xzr\n\t" /* Sub */ 
- "subs x16, x4, x8\n\t" - "sbcs x17, x5, x9\n\t" - "sbcs x19, x6, x10\n\t" - "sbcs x20, x7, x11\n\t" - "mov x25, #-19\n\t" + "subs x21, x8, x4\n\t" + "sbcs x22, x9, x5\n\t" + "sbcs x23, x10, x6\n\t" + "sbcs x24, x11, x7\n\t" "csetm x28, cc\n\t" - /* Mask the modulus */ - "and x25, x28, x25\n\t" - "and x26, x28, #0x7fffffffffffffff\n\t" + "mov x25, #-19\n\t" + "extr x28, x28, x24, #63\n\t" + "mul x25, x28, x25\n\t" /* Add modulus (if underflow) */ - "adds x16, x16, x25\n\t" - "adcs x17, x17, x28\n\t" - "adcs x19, x19, x28\n\t" - "adc x20, x20, x26\n\t" + "subs x21, x21, x25\n\t" + "sbcs x22, x22, xzr\n\t" + "and x24, x24, #0x7fffffffffffffff\n\t" + "sbcs x23, x23, xzr\n\t" + "sbc x24, x24, xzr\n\t" "stp x12, x13, [x0]\n\t" "stp x14, x15, [x0, #16]\n\t" - "stp x16, x17, [x1]\n\t" - "stp x19, x20, [x1, #16]\n\t" - "ldp x29, x30, [sp], #0x80\n\t" - : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt), [qz] "+r" (qz), [qt2d] "+r" (qt2d), [qyplusx] "+r" (qyplusx), [qyminusx] "+r" (qyminusx) + "stp x21, x22, [x1]\n\t" + "stp x23, x24, [x1, #16]\n\t" + "ldp x29, x30, [sp], #48\n\t" + : [r] "+r" (r), [p] "+r" (p), [q] "+r" (q) + : + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +void sc_reduce(byte* s) +{ + __asm__ __volatile__ ( + "ldp x2, x3, [%x[s]]\n\t" + "ldp x4, x5, [%x[s], #16]\n\t" + "ldp x6, x7, [%x[s], #32]\n\t" + "ldp x8, x9, [%x[s], #48]\n\t" + "lsr x23, x9, #56\n\t" + "lsl x9, x9, #4\n\t" + "orr x9, x9, x8, lsr 60\n\t" + "lsl x8, x8, #4\n\t" + "orr x8, x8, x7, lsr 60\n\t" + "lsl x7, x7, #4\n\t" + "orr x7, x7, x6, lsr 60\n\t" + "lsl x6, x6, #4\n\t" + "mov x1, #15\n\t" + "orr x6, x6, x5, lsr 60\n\t" + "bic x5, x5, x1, lsl 60\n\t" + "bic x9, x9, x1, lsl 60\n\t" + /* Add order times bits 504..511 */ + "mov x11, #0x2c13\n\t" + "movk x11, #0xa30a, lsl 16\n\t" + "movk x11, #0x9ce5, lsl 32\n\t" + "movk x11, #0xa7ed, lsl 48\n\t" + "mov x13, #0x6329\n\t" + "movk x13, #0x5d08, lsl 16\n\t" + "movk x13, #0x621, lsl 32\n\t" + "movk x13, #0xeb21, lsl 48\n\t" + "mul x10, x23, x11\n\t" + "umulh x11, x23, x11\n\t" + "mul x12, x23, x13\n\t" + "umulh x13, x23, x13\n\t" + "adds x6, x6, x10\n\t" + "adcs x7, x7, x11\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, xzr\n\t" + "adds x7, x7, x12\n\t" + "adcs x8, x8, x13\n\t" + "adc x9, x9, xzr\n\t" + "subs x8, x8, x23\n\t" + "sbc x9, x9, xzr\n\t" + /* Sub product of top 4 words and order */ + "mov x1, #0x2c13\n\t" + "movk x1, #0xa30a, lsl 16\n\t" + "movk x1, #0x9ce5, lsl 32\n\t" + "movk x1, #0xa7ed, lsl 48\n\t" + "mul x10, x6, x1\n\t" + "umulh x11, x6, x1\n\t" + "mul x12, x7, x1\n\t" + "umulh x13, x7, x1\n\t" + "mul x14, x8, x1\n\t" + "umulh x15, x8, x1\n\t" + "mul x16, x9, x1\n\t" + "umulh x17, x9, x1\n\t" + "adds x2, x2, x10\n\t" + "adcs x3, x3, x11\n\t" + "adcs x4, x4, x14\n\t" + "adcs x5, x5, x15\n\t" + "adc x19, xzr, xzr\n\t" + "adds x3, x3, x12\n\t" + "adcs x4, x4, x13\n\t" + "adcs x5, x5, x16\n\t" + "adc x19, x19, x17\n\t" + "mov x1, #0x6329\n\t" + "movk x1, #0x5d08, lsl 16\n\t" + "movk x1, #0x621, lsl 32\n\t" + "movk x1, #0xeb21, lsl 48\n\t" + "mul x10, x6, x1\n\t" + "umulh x11, x6, x1\n\t" + "mul x12, x7, x1\n\t" + "umulh x13, x7, x1\n\t" + "mul x14, x8, x1\n\t" + "umulh x15, x8, x1\n\t" + "mul x16, x9, x1\n\t" + "umulh x17, x9, x1\n\t" + "adds x3, x3, x10\n\t" + "adcs x4, x4, x11\n\t" + "adcs x5, x5, x14\n\t" + "adcs x19, x19, x15\n\t" + 
"adc x20, xzr, xzr\n\t" + "adds x4, x4, x12\n\t" + "adcs x5, x5, x13\n\t" + "adcs x19, x19, x16\n\t" + "adc x20, x20, x17\n\t" + "subs x4, x4, x6\n\t" + "sbcs x5, x5, x7\n\t" + "sbcs x6, x19, x8\n\t" + "sbc x7, x20, x9\n\t" + "asr x23, x7, #57\n\t" + /* Conditionally subtract order starting at bit 125 */ + "mov x10, xzr\n\t" + "mov x13, xzr\n\t" + "mov x11, #0xba7d\n\t" + "movk x11, #0x4b9e, lsl 16\n\t" + "movk x11, #0x4c63, lsl 32\n\t" + "movk x11, #0xcb02, lsl 48\n\t" + "mov x12, #0xf39a\n\t" + "movk x12, #0xd45e, lsl 16\n\t" + "movk x12, #0xdf3b, lsl 32\n\t" + "movk x12, #0x29b, lsl 48\n\t" + "movk x10, #0xa000, lsl 48\n\t" + "movk x13, #0x200, lsl 48\n\t" + "and x10, x10, x23\n\t" + "and x11, x11, x23\n\t" + "and x12, x12, x23\n\t" + "and x13, x13, x23\n\t" + "adds x3, x3, x10\n\t" + "adcs x4, x4, x11\n\t" + "adcs x5, x5, x12\n\t" + "adcs x6, x6, xzr\n\t" + "adc x7, x7, x13\n\t" + /* Move bits 252-376 to own registers */ + "lsl x7, x7, #4\n\t" + "orr x7, x7, x6, lsr 60\n\t" + "lsl x6, x6, #4\n\t" + "mov x23, #15\n\t" + "orr x6, x6, x5, lsr 60\n\t" + "bic x5, x5, x23, lsl 60\n\t" + /* Sub product of top 2 words and order */ + /* * -5812631a5cf5d3ed */ + "mov x1, #0x2c13\n\t" + "movk x1, #0xa30a, lsl 16\n\t" + "movk x1, #0x9ce5, lsl 32\n\t" + "movk x1, #0xa7ed, lsl 48\n\t" + "mul x10, x6, x1\n\t" + "umulh x11, x6, x1\n\t" + "mul x12, x7, x1\n\t" + "umulh x13, x7, x1\n\t" + "adds x2, x2, x10\n\t" + "adcs x3, x3, x11\n\t" + "adc x19, xzr, xzr\n\t" + "adds x3, x3, x12\n\t" + "adc x19, x19, x13\n\t" + /* * -14def9dea2f79cd7 */ + "mov x1, #0x6329\n\t" + "movk x1, #0x5d08, lsl 16\n\t" + "movk x1, #0x621, lsl 32\n\t" + "movk x1, #0xeb21, lsl 48\n\t" + "mul x10, x6, x1\n\t" + "umulh x11, x6, x1\n\t" + "mul x12, x7, x1\n\t" + "umulh x13, x7, x1\n\t" + "adds x3, x3, x10\n\t" + "adcs x4, x4, x11\n\t" + "adc x20, xzr, xzr\n\t" + "adds x4, x4, x12\n\t" + "adc x20, x20, x13\n\t" + /* Add overflows at 2 * 64 */ + "mov x1, #15\n\t" + "bic x5, x5, x1, lsl 60\n\t" + "adds x4, x4, x19\n\t" + "adc x5, x5, x20\n\t" + /* Subtract top at 2 * 64 */ + "subs x4, x4, x6\n\t" + "sbcs x5, x5, x7\n\t" + "sbc x1, x1, x1\n\t" + /* Conditional sub order */ + "mov x10, #0xd3ed\n\t" + "movk x10, #0x5cf5, lsl 16\n\t" + "movk x10, #0x631a, lsl 32\n\t" + "movk x10, #0x5812, lsl 48\n\t" + "mov x11, #0x9cd6\n\t" + "movk x11, #0xa2f7, lsl 16\n\t" + "movk x11, #0xf9de, lsl 32\n\t" + "movk x11, #0x14de, lsl 48\n\t" + "and x10, x10, x1\n\t" + "and x11, x11, x1\n\t" + "adds x2, x2, x10\n\t" + "adcs x3, x3, x11\n\t" + "and x1, x1, #0x1000000000000000\n\t" + "adcs x4, x4, xzr\n\t" + "mov x23, #15\n\t" + "adc x5, x5, x1\n\t" + "bic x5, x5, x23, lsl 60\n\t" + /* Store result */ + "stp x2, x3, [%x[s]]\n\t" + "stp x4, x5, [%x[s], #16]\n\t" + : [s] "+r" (s) + : + : "memory", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23" + ); +} + +void sc_muladd(byte* s, const byte* a, const byte* b, const byte* c) +{ + __asm__ __volatile__ ( + /* Multiply */ + "ldp x12, x13, [%x[a]]\n\t" + "ldp x14, x15, [%x[a], #16]\n\t" + "ldp x16, x17, [%x[b]]\n\t" + "ldp x19, x20, [%x[b], #16]\n\t" + /* A[0] * B[0] */ + "umulh x5, x12, x16\n\t" + "mul x4, x12, x16\n\t" + /* A[2] * B[0] */ + "umulh x7, x14, x16\n\t" + "mul x6, x14, x16\n\t" + /* A[1] * B[0] */ + "mul x21, x13, x16\n\t" + "adds x5, x5, x21\n\t" + "umulh x22, x13, x16\n\t" + "adcs x6, x6, x22\n\t" + "adc x7, x7, xzr\n\t" + /* A[1] * B[3] */ + "umulh x9, x13, x20\n\t" + "mul x8, x13, x20\n\t" + /* A[0] * 
B[1] */ + "mul x21, x12, x17\n\t" + "adds x5, x5, x21\n\t" + "umulh x22, x12, x17\n\t" + "adcs x6, x6, x22\n\t" + /* A[2] * B[1] */ + "mul x21, x14, x17\n\t" + "adcs x7, x7, x21\n\t" + "umulh x22, x14, x17\n\t" + "adcs x8, x8, x22\n\t" + "adc x9, x9, xzr\n\t" + /* A[1] * B[2] */ + "mul x21, x13, x19\n\t" + "adds x7, x7, x21\n\t" + "umulh x22, x13, x19\n\t" + "adcs x8, x8, x22\n\t" + "adcs x9, x9, xzr\n\t" + "adc x10, xzr, xzr\n\t" + /* A[0] * B[2] */ + "mul x21, x12, x19\n\t" + "adds x6, x6, x21\n\t" + "umulh x22, x12, x19\n\t" + "adcs x7, x7, x22\n\t" + "adcs x8, x8, xzr\n\t" + "adcs x9, x9, xzr\n\t" + "adc x10, x10, xzr\n\t" + /* A[1] * B[1] */ + "mul x21, x13, x17\n\t" + "adds x6, x6, x21\n\t" + "umulh x22, x13, x17\n\t" + "adcs x7, x7, x22\n\t" + /* A[3] * B[1] */ + "mul x21, x15, x17\n\t" + "adcs x8, x8, x21\n\t" + "umulh x22, x15, x17\n\t" + "adcs x9, x9, x22\n\t" + "adc x10, x10, xzr\n\t" + /* A[2] * B[2] */ + "mul x21, x14, x19\n\t" + "adds x8, x8, x21\n\t" + "umulh x22, x14, x19\n\t" + "adcs x9, x9, x22\n\t" + /* A[3] * B[3] */ + "mul x21, x15, x20\n\t" + "adcs x10, x10, x21\n\t" + "umulh x11, x15, x20\n\t" + "adc x11, x11, xzr\n\t" + /* A[0] * B[3] */ + "mul x21, x12, x20\n\t" + "adds x7, x7, x21\n\t" + "umulh x22, x12, x20\n\t" + "adcs x8, x8, x22\n\t" + /* A[2] * B[3] */ + "mul x21, x14, x20\n\t" + "adcs x9, x9, x21\n\t" + "umulh x22, x14, x20\n\t" + "adcs x10, x10, x22\n\t" + "adc x11, x11, xzr\n\t" + /* A[3] * B[0] */ + "mul x21, x15, x16\n\t" + "adds x7, x7, x21\n\t" + "umulh x22, x15, x16\n\t" + "adcs x8, x8, x22\n\t" + /* A[3] * B[2] */ + "mul x21, x15, x19\n\t" + "adcs x9, x9, x21\n\t" + "umulh x22, x15, x19\n\t" + "adcs x10, x10, x22\n\t" + "adc x11, x11, xzr\n\t" + /* Add c to a * b */ + "ldp x12, x13, [%x[c]]\n\t" + "ldp x14, x15, [%x[c], #16]\n\t" + "adds x4, x4, x12\n\t" + "adcs x5, x5, x13\n\t" + "adcs x6, x6, x14\n\t" + "adcs x7, x7, x15\n\t" + "adcs x8, x8, xzr\n\t" + "adcs x9, x9, xzr\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, x11, xzr\n\t" + "lsr x25, x11, #56\n\t" + "lsl x11, x11, #4\n\t" + "orr x11, x11, x10, lsr 60\n\t" + "lsl x10, x10, #4\n\t" + "orr x10, x10, x9, lsr 60\n\t" + "lsl x9, x9, #4\n\t" + "orr x9, x9, x8, lsr 60\n\t" + "lsl x8, x8, #4\n\t" + "mov x26, #15\n\t" + "orr x8, x8, x7, lsr 60\n\t" + "bic x7, x7, x26, lsl 60\n\t" + "bic x11, x11, x26, lsl 60\n\t" + /* Add order times bits 504..507 */ + "mov x22, #0x2c13\n\t" + "movk x22, #0xa30a, lsl 16\n\t" + "movk x22, #0x9ce5, lsl 32\n\t" + "movk x22, #0xa7ed, lsl 48\n\t" + "mov x24, #0x6329\n\t" + "movk x24, #0x5d08, lsl 16\n\t" + "movk x24, #0x621, lsl 32\n\t" + "movk x24, #0xeb21, lsl 48\n\t" + "mul x21, x25, x22\n\t" + "umulh x22, x25, x22\n\t" + "mul x23, x25, x24\n\t" + "umulh x24, x25, x24\n\t" + "adds x8, x8, x21\n\t" + "adcs x9, x9, x22\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, x11, xzr\n\t" + "adds x9, x9, x23\n\t" + "adcs x10, x10, x24\n\t" + "adc x11, x11, xzr\n\t" + "subs x10, x10, x25\n\t" + "sbc x11, x11, xzr\n\t" + /* Sub product of top 4 words and order */ + "mov x26, #0x2c13\n\t" + "movk x26, #0xa30a, lsl 16\n\t" + "movk x26, #0x9ce5, lsl 32\n\t" + "movk x26, #0xa7ed, lsl 48\n\t" + "mul x16, x8, x26\n\t" + "umulh x17, x8, x26\n\t" + "mul x19, x9, x26\n\t" + "umulh x20, x9, x26\n\t" + "mul x21, x10, x26\n\t" + "umulh x22, x10, x26\n\t" + "mul x23, x11, x26\n\t" + "umulh x24, x11, x26\n\t" + "adds x4, x4, x16\n\t" + "adcs x5, x5, x17\n\t" + "adcs x6, x6, x21\n\t" + "adcs x7, x7, x22\n\t" + "adc x12, xzr, xzr\n\t" + "adds x5, x5, x19\n\t" + "adcs x6, x6, x20\n\t" + "adcs x7, x7, 
x23\n\t" + "adc x12, x12, x24\n\t" + "mov x26, #0x6329\n\t" + "movk x26, #0x5d08, lsl 16\n\t" + "movk x26, #0x621, lsl 32\n\t" + "movk x26, #0xeb21, lsl 48\n\t" + "mul x16, x8, x26\n\t" + "umulh x17, x8, x26\n\t" + "mul x19, x9, x26\n\t" + "umulh x20, x9, x26\n\t" + "mul x21, x10, x26\n\t" + "umulh x22, x10, x26\n\t" + "mul x23, x11, x26\n\t" + "umulh x24, x11, x26\n\t" + "adds x5, x5, x16\n\t" + "adcs x6, x6, x17\n\t" + "adcs x7, x7, x21\n\t" + "adcs x12, x12, x22\n\t" + "adc x13, xzr, xzr\n\t" + "adds x6, x6, x19\n\t" + "adcs x7, x7, x20\n\t" + "adcs x12, x12, x23\n\t" + "adc x13, x13, x24\n\t" + "subs x6, x6, x8\n\t" + "sbcs x7, x7, x9\n\t" + "sbcs x8, x12, x10\n\t" + "sbc x9, x13, x11\n\t" + "asr x25, x9, #57\n\t" + /* Conditionally subtract order starting at bit 125 */ + "mov x16, xzr\n\t" + "mov x20, xzr\n\t" + "mov x17, #0xba7d\n\t" + "movk x17, #0x4b9e, lsl 16\n\t" + "movk x17, #0x4c63, lsl 32\n\t" + "movk x17, #0xcb02, lsl 48\n\t" + "mov x19, #0xf39a\n\t" + "movk x19, #0xd45e, lsl 16\n\t" + "movk x19, #0xdf3b, lsl 32\n\t" + "movk x19, #0x29b, lsl 48\n\t" + "movk x16, #0xa000, lsl 48\n\t" + "movk x20, #0x200, lsl 48\n\t" + "and x16, x16, x25\n\t" + "and x17, x17, x25\n\t" + "and x19, x19, x25\n\t" + "and x20, x20, x25\n\t" + "adds x5, x5, x16\n\t" + "adcs x6, x6, x17\n\t" + "adcs x7, x7, x19\n\t" + "adcs x8, x8, xzr\n\t" + "adc x9, x9, x20\n\t" + /* Move bits 252-376 to own registers */ + "lsl x9, x9, #4\n\t" + "orr x9, x9, x8, lsr 60\n\t" + "lsl x8, x8, #4\n\t" + "mov x25, #15\n\t" + "orr x8, x8, x7, lsr 60\n\t" + "bic x7, x7, x25, lsl 60\n\t" + /* Sub product of top 2 words and order */ + /* * -5812631a5cf5d3ed */ + "mov x26, #0x2c13\n\t" + "movk x26, #0xa30a, lsl 16\n\t" + "movk x26, #0x9ce5, lsl 32\n\t" + "movk x26, #0xa7ed, lsl 48\n\t" + "mul x16, x8, x26\n\t" + "umulh x17, x8, x26\n\t" + "mul x19, x9, x26\n\t" + "umulh x20, x9, x26\n\t" + "adds x4, x4, x16\n\t" + "adcs x5, x5, x17\n\t" + "adc x12, xzr, xzr\n\t" + "adds x5, x5, x19\n\t" + "adc x12, x12, x20\n\t" + /* * -14def9dea2f79cd7 */ + "mov x26, #0x6329\n\t" + "movk x26, #0x5d08, lsl 16\n\t" + "movk x26, #0x621, lsl 32\n\t" + "movk x26, #0xeb21, lsl 48\n\t" + "mul x16, x8, x26\n\t" + "umulh x17, x8, x26\n\t" + "mul x19, x9, x26\n\t" + "umulh x20, x9, x26\n\t" + "adds x5, x5, x16\n\t" + "adcs x6, x6, x17\n\t" + "adc x13, xzr, xzr\n\t" + "adds x6, x6, x19\n\t" + "adc x13, x13, x20\n\t" + /* Add overflows at 2 * 64 */ + "mov x26, #15\n\t" + "bic x7, x7, x26, lsl 60\n\t" + "adds x6, x6, x12\n\t" + "adc x7, x7, x13\n\t" + /* Subtract top at 2 * 64 */ + "subs x6, x6, x8\n\t" + "sbcs x7, x7, x9\n\t" + "sbc x26, x26, x26\n\t" + /* Conditional sub order */ + "mov x16, #0xd3ed\n\t" + "movk x16, #0x5cf5, lsl 16\n\t" + "movk x16, #0x631a, lsl 32\n\t" + "movk x16, #0x5812, lsl 48\n\t" + "mov x17, #0x9cd6\n\t" + "movk x17, #0xa2f7, lsl 16\n\t" + "movk x17, #0xf9de, lsl 32\n\t" + "movk x17, #0x14de, lsl 48\n\t" + "and x16, x16, x26\n\t" + "and x17, x17, x26\n\t" + "adds x4, x4, x16\n\t" + "adcs x5, x5, x17\n\t" + "and x26, x26, #0x1000000000000000\n\t" + "adcs x6, x6, xzr\n\t" + "mov x25, #15\n\t" + "adc x7, x7, x26\n\t" + "bic x7, x7, x25, lsl 60\n\t" + /* Store result */ + "stp x4, x5, [%x[s]]\n\t" + "stp x6, x7, [%x[s], #16]\n\t" + : [s] "+r" (s), [a] "+r" (a), [b] "+r" (b), [c] "+r" (c) : - : "memory", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", 
"x21", "x22", "x23", "x24", "x25", "x26" ); } -#endif /* HAVE_CURVE25519 */ +#endif /* HAVE_ED25519 */ +#endif /* !CURVE25519_SMALL || !ED25519_SMALL */ +#endif /* HAVE_CURVE25519 || HAVE_ED25519 */ #endif /* __aarch64__ */ #endif /* WOLFSSL_ARMASM */ +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-sha256.c b/wolfcrypt/src/port/arm/armv8-sha256.c index beea8d01f7..12a5132fcf 100644 --- a/wolfcrypt/src/port/arm/armv8-sha256.c +++ b/wolfcrypt/src/port/arm/armv8-sha256.c @@ -1318,7 +1318,7 @@ static WC_INLINE int Sha256Final(wc_Sha256* sha256, byte* hash) extern void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len); -/* ARMv8 hardware acceleration Aarch32 */ +/* ARMv8 hardware acceleration Aarch32 and Thumb2 */ static WC_INLINE int Sha256Update(wc_Sha256* sha256, const byte* data, word32 len) { int ret = 0; diff --git a/wolfcrypt/src/port/arm/armv8-sha3-asm.S b/wolfcrypt/src/port/arm/armv8-sha3-asm.S index b4c5d76941..209ee0cf48 100644 --- a/wolfcrypt/src/port/arm/armv8-sha3-asm.S +++ b/wolfcrypt/src/port/arm/armv8-sha3-asm.S @@ -30,6 +30,7 @@ */ #ifdef WOLFSSL_ARMASM #ifdef __aarch64__ +#ifndef WOLFSSL_ARMASM_INLINE #ifdef WOLFSSL_SHA3 #ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 #ifndef __APPLE__ @@ -213,3 +214,4 @@ L_sha3_crypto_begin: #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif +#endif /* !WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c b/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c index 54423e44fc..0e6dc056e2 100644 --- a/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c @@ -30,6 +30,7 @@ */ #ifdef WOLFSSL_ARMASM #ifdef __aarch64__ +#ifdef WOLFSSL_ARMASM_INLINE #include #ifdef WOLFSSL_SHA3 @@ -183,3 +184,4 @@ void BlockSha3(unsigned long* state) #endif /* WOLFSSL_SHA3 */ #endif /* __aarch64__ */ #endif /* WOLFSSL_ARMASM */ +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-sha512-asm.S b/wolfcrypt/src/port/arm/armv8-sha512-asm.S index 06ceddcbea..3ff015800b 100644 --- a/wolfcrypt/src/port/arm/armv8-sha512-asm.S +++ b/wolfcrypt/src/port/arm/armv8-sha512-asm.S @@ -30,6 +30,7 @@ */ #ifdef WOLFSSL_ARMASM #ifdef __aarch64__ +#ifndef WOLFSSL_ARMASM_INLINE #ifdef WOLFSSL_SHA512 #ifndef WOLFSSL_ARMASM_CRYPTO_SHA512 #ifndef __APPLE__ @@ -1739,3 +1740,4 @@ L_sha512_len_crypto_begin: #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif +#endif /* !WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-sha512-asm_c.c b/wolfcrypt/src/port/arm/armv8-sha512-asm_c.c index 36eee81276..62f2ecbea3 100644 --- a/wolfcrypt/src/port/arm/armv8-sha512-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-sha512-asm_c.c @@ -30,6 +30,7 @@ */ #ifdef WOLFSSL_ARMASM #ifdef __aarch64__ +#ifdef WOLFSSL_ARMASM_INLINE #include #ifdef WOLFSSL_SHA512 @@ -1665,3 +1666,4 @@ void Transform_Sha512_Len_crypto(wc_Sha512* sha512, const byte* data, word32 len #endif /* WOLFSSL_SHA512 */ #endif /* __aarch64__ */ #endif /* WOLFSSL_ARMASM */ +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-sha512.c b/wolfcrypt/src/port/arm/armv8-sha512.c index 4a0a578408..45806249a8 100644 --- a/wolfcrypt/src/port/arm/armv8-sha512.c +++ b/wolfcrypt/src/port/arm/armv8-sha512.c @@ -146,23 +146,6 @@ static int InitSha512_256(wc_Sha512* sha512) #ifdef WOLFSSL_SHA512 -#ifdef WOLFSSL_ARMASM -#ifdef __aarch64__ -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA512 - extern void Transform_Sha512_Len_neon(wc_Sha512* sha512, const 
byte* data, - word32 len); - #define Transform_Sha512_Len Transform_Sha512_Len_neon -#else - extern void Transform_Sha512_Len_crypto(wc_Sha512* sha512, const byte* data, - word32 len); - #define Transform_Sha512_Len Transform_Sha512_Len_crypto -#endif -#else -extern void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, - word32 len); -#endif -#endif - static int InitSha512_Family(wc_Sha512* sha512, void* heap, int devId, enum wc_HashType type) { diff --git a/wolfcrypt/src/port/arm/thumb2-curve25519.S b/wolfcrypt/src/port/arm/thumb2-curve25519.S new file mode 100644 index 0000000000..c6d1f9d2ef --- /dev/null +++ b/wolfcrypt/src/port/arm/thumb2-curve25519.S @@ -0,0 +1,4056 @@ +/* thumb2-curve25519 + * + * Copyright (C) 2006-2023 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./x25519/x25519.rb thumb2 ../wolfssl/wolfcrypt/src/port/arm/thumb2-curve25519.S + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__thumb__) +#ifndef WOLFSSL_ARMASM_INLINE + .thumb + .syntax unified +#if defined(HAVE_CURVE25519) || defined(HAVE_ED25519) +#if !defined(CURVE25519_SMALL) || !defined(ED25519_SMALL) + + .text + .align 4 + .globl fe_init + .type fe_init, %function +fe_init: + BX lr + # Cycle Count = 4 + .size fe_init,.-fe_init + .text + .align 4 + .globl fe_add_sub_op + .type fe_add_sub_op, %function +fe_add_sub_op: + PUSH {lr} + # Add-Sub + LDRD r4, r5, [r2] + LDRD r6, r7, [r3] + # Add + ADDS r8, r4, r6 + MOV r12, #0x0 + ADCS r9, r5, r7 + ADC r12, r12, #0x0 + STRD r8, r9, [r0] + # Sub + SUBS r10, r4, r6 + SBCS r11, r5, r7 + STRD r10, r11, [r1] + LDRD r4, r5, [r2, #8] + LDRD r6, r7, [r3, #8] + # Sub + SBCS r10, r4, r6 + MOV lr, #0x0 + SBCS r11, r5, r7 + ADC lr, lr, #0x0 + STRD r10, r11, [r1, #8] + # Add + SUBS r12, r12, #0x1 + ADCS r8, r4, r6 + ADCS r9, r5, r7 + STRD r8, r9, [r0, #8] + LDRD r4, r5, [r2, #16] + LDRD r6, r7, [r3, #16] + # Add + ADCS r8, r4, r6 + MOV r12, #0x0 + ADCS r9, r5, r7 + ADC r12, r12, #0x0 + STRD r8, r9, [r0, #16] + # Sub + SUBS lr, lr, #0x1 + SBCS r10, r4, r6 + SBCS r11, r5, r7 + STRD r10, r11, [r1, #16] + LDRD r4, r5, [r2, #24] + LDRD r6, r7, [r3, #24] + # Sub + SBCS r10, r4, r6 + SBC r11, r5, r7 + # Add + SUBS r12, r12, #0x1 + ADCS r8, r4, r6 + MOV r12, #0x0 + ADCS r9, r5, r7 + ADC r12, r12, #0x0 + # Multiply -modulus by overflow + LSL r3, r12, #1 + MOV r12, #0x13 + ORR r3, r3, r9, LSR #31 + MUL r12, r3, r12 + # Add -x*modulus (if overflow) + LDRD r4, r5, [r0] + LDRD r6, r7, [r0, #8] + ADDS r4, r4, r12 + ADCS r5, r5, #0x0 + ADCS r6, r6, #0x0 + ADCS r7, r7, #0x0 + STRD r4, r5, [r0] + STRD r6, r7, [r0, #8] + LDRD r4, r5, [r0, #16] + ADCS r4, r4, #0x0 + ADCS r5, r5, #0x0 + STRD r4, r5, [r0, #16] + BFC r9, 
#31, #1 + ADCS r8, r8, #0x0 + ADC r9, r9, #0x0 + STRD r8, r9, [r0, #24] + # Add -modulus on underflow + MOV lr, #0x13 + AND lr, lr, r11, ASR #31 + LDM r1, {r4, r5, r6, r7, r8, r9} + SUBS r4, r4, lr + SBCS r5, r5, #0x0 + SBCS r6, r6, #0x0 + SBCS r7, r7, #0x0 + SBCS r8, r8, #0x0 + SBCS r9, r9, #0x0 + BFC r11, #31, #1 + SBCS r10, r10, #0x0 + SBC r11, r11, #0x0 + STM r1, {r4, r5, r6, r7, r8, r9, r10, r11} + # Done Add-Sub + POP {pc} + # Cycle Count = 134 + .size fe_add_sub_op,.-fe_add_sub_op + .text + .align 4 + .globl fe_sub_op + .type fe_sub_op, %function +fe_sub_op: + PUSH {lr} + # Sub + LDM r2!, {r6, r7, r8, r9, r10, r11, r12, lr} + LDM r1!, {r2, r3, r4, r5} + SUBS r6, r2, r6 + SBCS r7, r3, r7 + SBCS r8, r4, r8 + SBCS r9, r5, r9 + LDM r1!, {r2, r3, r4, r5} + SBCS r10, r2, r10 + SBCS r11, r3, r11 + SBCS r12, r4, r12 + SBC lr, r5, lr + MOV r2, #0x13 + AND r2, r2, lr, ASR #31 + SUBS r6, r6, r2 + SBCS r7, r7, #0x0 + SBCS r8, r8, #0x0 + SBCS r9, r9, #0x0 + SBCS r10, r10, #0x0 + SBCS r11, r11, #0x0 + BFC lr, #31, #1 + SBCS r12, r12, #0x0 + SBC lr, lr, #0x0 + STM r0, {r6, r7, r8, r9, r10, r11, r12, lr} + # Done Sub + POP {pc} + # Cycle Count = 51 + .size fe_sub_op,.-fe_sub_op + .text + .align 4 + .globl fe_sub + .type fe_sub, %function +fe_sub: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + BL fe_sub_op + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 24 + .size fe_sub,.-fe_sub + .text + .align 4 + .globl fe_add_op + .type fe_add_op, %function +fe_add_op: + PUSH {lr} + # Add + LDM r2!, {r6, r7, r8, r9, r10, r11, r12, lr} + LDM r1!, {r2, r3, r4, r5} + ADDS r6, r2, r6 + ADCS r7, r3, r7 + ADCS r8, r4, r8 + ADCS r9, r5, r9 + LDM r1!, {r2, r3, r4, r5} + ADCS r10, r2, r10 + ADCS r11, r3, r11 + ADCS r12, r4, r12 + ADC lr, r5, lr + MOV r2, #0x13 + AND r2, r2, lr, ASR #31 + ADDS r6, r6, r2 + ADCS r7, r7, #0x0 + ADCS r8, r8, #0x0 + ADCS r9, r9, #0x0 + ADCS r10, r10, #0x0 + ADCS r11, r11, #0x0 + BFC lr, #31, #1 + ADCS r12, r12, #0x0 + ADC lr, lr, #0x0 + STM r0, {r6, r7, r8, r9, r10, r11, r12, lr} + # Done Add + POP {pc} + # Cycle Count = 51 + .size fe_add_op,.-fe_add_op + .text + .align 4 + .globl fe_add + .type fe_add, %function +fe_add: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + BL fe_add_op + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 24 + .size fe_add,.-fe_add +#ifdef HAVE_ED25519 + .text + .align 4 + .globl fe_frombytes + .type fe_frombytes, %function +fe_frombytes: + PUSH {r4, r5, r6, r7, r8, r9, lr} + LDM r1, {r2, r3, r4, r5, r6, r7, r8, r9} + BFC r9, #31, #1 + STM r0, {r2, r3, r4, r5, r6, r7, r8, r9} + POP {r4, r5, r6, r7, r8, r9, pc} + # Cycle Count = 35 + .size fe_frombytes,.-fe_frombytes + .text + .align 4 + .globl fe_tobytes + .type fe_tobytes, %function +fe_tobytes: + PUSH {r4, r5, r6, r7, r8, r9, r10, lr} + LDM r1, {r2, r3, r4, r5, r6, r7, r8, r9} + ADDS r10, r2, #0x13 + ADCS r10, r3, #0x0 + ADCS r10, r4, #0x0 + ADCS r10, r5, #0x0 + ADCS r10, r6, #0x0 + ADCS r10, r7, #0x0 + ADCS r10, r8, #0x0 + ADC r10, r9, #0x0 + ASR r10, r10, #31 + AND r10, r10, #0x13 + ADDS r2, r2, r10 + ADCS r3, r3, #0x0 + ADCS r4, r4, #0x0 + ADCS r5, r5, #0x0 + ADCS r6, r6, #0x0 + ADCS r7, r7, #0x0 + ADCS r8, r8, #0x0 + ADC r9, r9, #0x0 + BFC r9, #31, #1 + STM r0, {r2, r3, r4, r5, r6, r7, r8, r9} + POP {r4, r5, r6, r7, r8, r9, r10, pc} + # Cycle Count = 55 + .size fe_tobytes,.-fe_tobytes + .text + .align 4 + .globl fe_1 + .type fe_1, %function +fe_1: + # Set one + MOV r2, #0x1 + MOV r3, #0x0 + STRD r2, r3, [r0] + MOV r2, #0x0 + STRD r2, r3, [r0, #8] + STRD r2, r3, [r0, #16] + STRD r2, 
r3, [r0, #24] + BX lr + # Cycle Count = 19 + .size fe_1,.-fe_1 + .text + .align 4 + .globl fe_0 + .type fe_0, %function +fe_0: + # Set zero + MOV r2, #0x0 + MOV r3, #0x0 + STRD r2, r3, [r0] + STRD r2, r3, [r0, #8] + STRD r2, r3, [r0, #16] + STRD r2, r3, [r0, #24] + BX lr + # Cycle Count = 18 + .size fe_0,.-fe_0 + .text + .align 4 + .globl fe_copy + .type fe_copy, %function +fe_copy: + PUSH {r4, r5, lr} + # Copy + LDRD r2, r3, [r1] + LDRD r4, r5, [r1, #8] + STRD r2, r3, [r0] + STRD r4, r5, [r0, #8] + LDRD r2, r3, [r1, #16] + LDRD r4, r5, [r1, #24] + STRD r2, r3, [r0, #16] + STRD r4, r5, [r0, #24] + POP {r4, r5, pc} + # Cycle Count = 32 + .size fe_copy,.-fe_copy + .text + .align 4 + .globl fe_neg + .type fe_neg, %function +fe_neg: + PUSH {r4, r5, r6, r7, lr} + MVN r7, #0x0 + MVN r6, #0x12 + LDM r1!, {r2, r3, r4, r5} + SUBS r2, r6, r2 + SBCS r3, r7, r3 + SBCS r4, r7, r4 + SBCS r5, r7, r5 + STM r0!, {r2, r3, r4, r5} + MVN r6, #0x80000000 + LDM r1!, {r2, r3, r4, r5} + SBCS r2, r7, r2 + SBCS r3, r7, r3 + SBCS r4, r7, r4 + SBC r5, r6, r5 + STM r0!, {r2, r3, r4, r5} + POP {r4, r5, r6, r7, pc} + # Cycle Count = 43 + .size fe_neg,.-fe_neg + .text + .align 4 + .globl fe_isnonzero + .type fe_isnonzero, %function +fe_isnonzero: + PUSH {r4, r5, r6, r7, r8, r9, r10, lr} + LDM r0, {r2, r3, r4, r5, r6, r7, r8, r9} + ADDS r1, r2, #0x13 + ADCS r1, r3, #0x0 + ADCS r1, r4, #0x0 + ADCS r1, r5, #0x0 + ADCS r1, r6, #0x0 + ADCS r1, r7, #0x0 + ADCS r1, r8, #0x0 + ADC r1, r9, #0x0 + ASR r1, r1, #31 + AND r1, r1, #0x13 + ADDS r2, r2, r1 + ADCS r3, r3, #0x0 + ADCS r4, r4, #0x0 + ADCS r5, r5, #0x0 + ADCS r6, r6, #0x0 + ADCS r7, r7, #0x0 + ADCS r8, r8, #0x0 + ADC r9, r9, #0x0 + BFC r9, #31, #1 + ORR r2, r2, r3 + ORR r4, r4, r5 + ORR r6, r6, r7 + ORR r8, r8, r9 + ORR r4, r4, r6 + ORR r2, r2, r8 + ORR r0, r2, r4 + POP {r4, r5, r6, r7, r8, r9, r10, pc} + # Cycle Count = 53 + .size fe_isnonzero,.-fe_isnonzero + .text + .align 4 + .globl fe_isnegative + .type fe_isnegative, %function +fe_isnegative: + PUSH {r4, r5, lr} + LDM r0!, {r2, r3, r4, r5} + ADDS r1, r2, #0x13 + ADCS r1, r3, #0x0 + ADCS r1, r4, #0x0 + ADCS r1, r5, #0x0 + LDM r0, {r2, r3, r4, r5} + ADCS r1, r2, #0x0 + ADCS r1, r3, #0x0 + ADCS r1, r4, #0x0 + LDR r2, [r0, #-16] + ADC r1, r5, #0x0 + AND r0, r2, #0x1 + LSR r1, r1, #31 + EOR r0, r0, r1 + POP {r4, r5, pc} + # Cycle Count = 31 + .size fe_isnegative,.-fe_isnegative +#ifndef WC_NO_CACHE_RESISTANT + .text + .align 4 + .globl fe_cmov_table + .type fe_cmov_table, %function +fe_cmov_table: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SXTB r2, r2 + SBFX r3, r2, #7, #1 + EOR r12, r2, r3 + SUB r12, r12, r3 + MOV r4, #0x1 + MOV r5, #0x0 + MOV r6, #0x1 + MOV r7, #0x0 + MOV r8, #0x0 + MOV r9, #0x0 + MOV r3, #0x80000000 + ROR r3, r3, #31 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #32] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #64] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #30 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #32] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, 
r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #64] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #29 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #32] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #64] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #28 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #32] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #64] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #27 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #32] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #64] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #26 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #32] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #64] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #25 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #32] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #64] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #24 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #32] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #64] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + SUB r1, r1, #0x2a0 + MVN r10, #0x12 + MVN r11, #0x0 + SUBS r10, r10, r8 + SBCS r11, r11, r9 + SBC lr, lr, lr + ASR r12, r2, #31 + EOR r3, r4, r6 + AND r3, r3, r12 + EOR r4, r4, r3 + EOR r6, r6, r3 + EOR r3, r5, r7 + AND r3, r3, r12 + EOR r5, r5, r3 + EOR r7, r7, r3 + EOR r10, r10, r8 + AND r10, r10, r12 + EOR r8, r8, r10 + EOR r11, r11, r9 + 
AND r11, r11, r12 + EOR r9, r9, r11 + STRD r4, r5, [r0] + STRD r6, r7, [r0, #32] + STRD r8, r9, [r0, #64] + SBFX r3, r2, #7, #1 + EOR r12, r2, r3 + SUB r12, r12, r3 + MOV r4, #0x0 + MOV r5, #0x0 + MOV r6, #0x0 + MOV r7, #0x0 + MOV r8, #0x0 + MOV r9, #0x0 + MOV r3, #0x80000000 + ROR r3, r3, #31 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #8] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #40] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #72] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #30 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #8] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #40] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #72] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #29 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #8] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #40] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #72] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #28 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #8] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #40] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #72] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #27 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #8] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #40] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #72] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #26 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #8] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #40] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #72] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #25 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #8] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, 
r11, [r1, #40] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #72] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #24 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #8] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #40] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #72] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + SUB r1, r1, #0x2a0 + MVN r10, #0x0 + MVN r11, #0x0 + RSBS lr, lr, #0x0 + SBCS r10, r10, r8 + SBCS r11, r11, r9 + SBC lr, lr, lr + ASR r12, r2, #31 + EOR r3, r4, r6 + AND r3, r3, r12 + EOR r4, r4, r3 + EOR r6, r6, r3 + EOR r3, r5, r7 + AND r3, r3, r12 + EOR r5, r5, r3 + EOR r7, r7, r3 + EOR r10, r10, r8 + AND r10, r10, r12 + EOR r8, r8, r10 + EOR r11, r11, r9 + AND r11, r11, r12 + EOR r9, r9, r11 + STRD r4, r5, [r0, #8] + STRD r6, r7, [r0, #40] + STRD r8, r9, [r0, #72] + SBFX r3, r2, #7, #1 + EOR r12, r2, r3 + SUB r12, r12, r3 + MOV r4, #0x0 + MOV r5, #0x0 + MOV r6, #0x0 + MOV r7, #0x0 + MOV r8, #0x0 + MOV r9, #0x0 + MOV r3, #0x80000000 + ROR r3, r3, #31 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #16] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #48] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #80] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #30 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #16] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #48] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #80] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #29 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #16] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #48] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #80] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #28 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #16] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #48] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #80] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #27 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, 
#16] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #48] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #80] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #26 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #16] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #48] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #80] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #25 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #16] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #48] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #80] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #24 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #16] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #48] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #80] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + SUB r1, r1, #0x2a0 + MVN r10, #0x0 + MVN r11, #0x0 + RSBS lr, lr, #0x0 + SBCS r10, r10, r8 + SBCS r11, r11, r9 + SBC lr, lr, lr + ASR r12, r2, #31 + EOR r3, r4, r6 + AND r3, r3, r12 + EOR r4, r4, r3 + EOR r6, r6, r3 + EOR r3, r5, r7 + AND r3, r3, r12 + EOR r5, r5, r3 + EOR r7, r7, r3 + EOR r10, r10, r8 + AND r10, r10, r12 + EOR r8, r8, r10 + EOR r11, r11, r9 + AND r11, r11, r12 + EOR r9, r9, r11 + STRD r4, r5, [r0, #16] + STRD r6, r7, [r0, #48] + STRD r8, r9, [r0, #80] + SBFX r3, r2, #7, #1 + EOR r12, r2, r3 + SUB r12, r12, r3 + MOV r4, #0x0 + MOV r5, #0x0 + MOV r6, #0x0 + MOV r7, #0x0 + MOV r8, #0x0 + MOV r9, #0x0 + MOV r3, #0x80000000 + ROR r3, r3, #31 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #24] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #56] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #88] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #30 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #24] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #56] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #88] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR 
r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #29 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #24] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #56] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #88] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #28 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #24] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #56] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #88] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #27 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #24] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #56] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #88] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #26 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #24] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #56] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #88] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #25 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #24] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #56] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #88] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + ADD r1, r1, #0x60 + MOV r3, #0x80000000 + ROR r3, r3, #24 + ROR r3, r3, r12 + ASR r3, r3, #31 + LDRD r10, r11, [r1, #24] + EOR r10, r10, r4 + EOR r11, r11, r5 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r4, r4, r10 + EOR r5, r5, r11 + LDRD r10, r11, [r1, #56] + EOR r10, r10, r6 + EOR r11, r11, r7 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r6, r6, r10 + EOR r7, r7, r11 + LDRD r10, r11, [r1, #88] + EOR r10, r10, r8 + EOR r11, r11, r9 + AND r10, r10, r3 + AND r11, r11, r3 + EOR r8, r8, r10 + EOR r9, r9, r11 + SUB r1, r1, #0x2a0 + MVN r10, #0x0 + MVN r11, #0x80000000 + RSBS lr, lr, #0x0 + SBCS r10, r10, r8 + SBC r11, r11, r9 + ASR r12, r2, #31 + EOR r3, r4, r6 + AND r3, r3, r12 + EOR r4, r4, r3 + EOR r6, r6, r3 + EOR r3, r5, r7 + AND r3, r3, r12 + EOR r5, r5, r3 + EOR r7, r7, r3 + EOR r10, r10, r8 + AND r10, r10, r12 + EOR r8, r8, r10 + EOR r11, r11, r9 + AND r11, r11, r12 + EOR r9, r9, r11 + STRD r4, r5, [r0, #24] + STRD r6, r7, [r0, #56] + STRD r8, r9, [r0, #88] 
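The fe_cmov_table variant above (built when WC_NO_CACHE_RESISTANT is not defined) reads every one of the eight precomputed table points and folds the wanted one into the accumulator with EOR/AND/EOR masks, so neither the branch pattern nor the memory addresses depend on the secret window value; a second mask derived from the sign of the index selects the negated point. A minimal C sketch of that select, assuming eight 32-bit limbs per field element and my own helper name:

    #include <stdint.h>

    typedef uint32_t fe_limbs[8];   /* one field element, 8 x 32-bit limbs */

    /* Illustrative only: fold table[idx-1] into r for idx in 1..8 and leave
     * r untouched for idx == 0, with no secret-dependent branch or address.
     * The caller initialises r to the neutral entry, as the assembly does. */
    static void ct_table_select(fe_limbs r, const fe_limbs table[8], uint32_t idx)
    {
        for (uint32_t j = 0; j < 8; j++) {
            uint32_t diff = (j + 1) ^ idx;
            uint32_t eq   = (uint32_t)((((uint64_t)diff) - 1) >> 63); /* 1 iff equal */
            uint32_t mask = 0u - eq;                  /* all-ones or all-zeros */
            for (int i = 0; i < 8; i++)
                r[i] ^= (r[i] ^ table[j][i]) & mask;  /* constant-time move */
        }
    }

The trailing MVN/SBCS block computes the negation of the third table coordinate modulo 2^255 - 19, and the sign-conditioned swaps of the first two coordinates finish the branch-free handling of negative window digits.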
+ POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 1195 + .size fe_cmov_table,.-fe_cmov_table +#else + .text + .align 4 + .globl fe_cmov_table + .type fe_cmov_table, %function +fe_cmov_table: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SXTB r2, r2 + SBFX r3, r2, #7, #1 + EOR r2, r2, r3 + SUB r2, r2, r3 + CLZ lr, r2 + LSL lr, lr, #26 + ASR lr, lr, #31 + MVN lr, lr + ADD r2, r2, lr + MOV r12, #0x60 + MUL r2, r2, r12 + ADD r1, r1, r2 + LDM r1!, {r4, r5, r6, r7, r8, r9, r10, r11} + AND r4, r4, lr + AND r5, r5, lr + AND r6, r6, lr + AND r7, r7, lr + AND r8, r8, lr + AND r9, r9, lr + AND r10, r10, lr + AND r11, r11, lr + MVN r12, lr + SUB r4, r4, r12 + MOV r12, #0x20 + AND r12, r12, r3 + ADD r0, r0, r12 + STM r0, {r4, r5, r6, r7, r8, r9, r10, r11} + SUB r0, r0, r12 + LDM r1!, {r4, r5, r6, r7, r8, r9, r10, r11} + AND r4, r4, lr + AND r5, r5, lr + AND r6, r6, lr + AND r7, r7, lr + AND r8, r8, lr + AND r9, r9, lr + AND r10, r10, lr + AND r11, r11, lr + MVN r12, lr + SUB r4, r4, r12 + MOV r12, #0x20 + BIC r12, r12, r3 + ADD r0, r0, r12 + STM r0, {r4, r5, r6, r7, r8, r9, r10, r11} + SUB r0, r0, r12 + ADD r0, r0, #0x40 + LDM r1!, {r4, r5, r6, r7} + MVN r12, #0x12 + SUBS r8, r12, r4 + SBCS r9, r3, r5 + SBCS r10, r3, r6 + SBCS r11, r3, r7 + BIC r4, r4, r3 + BIC r5, r5, r3 + BIC r6, r6, r3 + BIC r7, r7, r3 + AND r8, r8, r3 + AND r9, r9, r3 + AND r10, r10, r3 + AND r11, r11, r3 + ORR r4, r4, r8 + ORR r5, r5, r9 + ORR r6, r6, r10 + ORR r7, r7, r11 + AND r4, r4, lr + AND r5, r5, lr + AND r6, r6, lr + AND r7, r7, lr + STM r0!, {r4, r5, r6, r7} + LDM r1!, {r4, r5, r6, r7} + MVN r12, #0x80000000 + SBCS r8, r3, r4 + SBCS r9, r3, r5 + SBCS r10, r3, r6 + SBC r11, r12, r7 + BIC r4, r4, r3 + BIC r5, r5, r3 + BIC r6, r6, r3 + BIC r7, r7, r3 + AND r8, r8, r3 + AND r9, r9, r3 + AND r10, r10, r3 + AND r11, r11, r3 + ORR r4, r4, r8 + ORR r5, r5, r9 + ORR r6, r6, r10 + ORR r7, r7, r11 + AND r4, r4, lr + AND r5, r5, lr + AND r6, r6, lr + AND r7, r7, lr + STM r0!, {r4, r5, r6, r7} + SUB r1, r1, r2 + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 160 + .size fe_cmov_table,.-fe_cmov_table +#endif /* WC_NO_CACHE_RESISTANT */ +#endif /* HAVE_ED25519 */ + .text + .align 4 + .globl fe_mul_op + .type fe_mul_op, %function +fe_mul_op: + PUSH {lr} + SUB sp, sp, #0x2c + STRD r0, r1, [sp, #36] + MOV lr, r2 + LDM r1, {r0, r1, r2, r3} + LDM lr!, {r4, r5, r6} + UMULL r10, r11, r0, r4 + UMULL r12, r7, r1, r4 + UMAAL r11, r12, r0, r5 + UMULL r8, r9, r2, r4 + UMAAL r12, r8, r1, r5 + UMAAL r12, r7, r0, r6 + UMAAL r8, r9, r3, r4 + STM sp, {r10, r11, r12} + UMAAL r7, r8, r2, r5 + LDM lr!, {r4} + UMULL r10, r11, r1, r6 + UMAAL r8, r9, r2, r6 + UMAAL r7, r10, r0, r4 + UMAAL r8, r11, r3, r5 + STR r7, [sp, #12] + UMAAL r8, r10, r1, r4 + UMAAL r9, r11, r3, r6 + UMAAL r9, r10, r2, r4 + UMAAL r10, r11, r3, r4 + LDM lr, {r4, r5, r6, r7} + MOV r12, #0x0 + UMLAL r8, r12, r0, r4 + UMAAL r9, r12, r1, r4 + UMAAL r10, r12, r2, r4 + UMAAL r11, r12, r3, r4 + MOV r4, #0x0 + UMLAL r9, r4, r0, r5 + UMAAL r10, r4, r1, r5 + UMAAL r11, r4, r2, r5 + UMAAL r12, r4, r3, r5 + MOV r5, #0x0 + UMLAL r10, r5, r0, r6 + UMAAL r11, r5, r1, r6 + UMAAL r12, r5, r2, r6 + UMAAL r4, r5, r3, r6 + MOV r6, #0x0 + UMLAL r11, r6, r0, r7 + LDR r0, [sp, #40] + UMAAL r12, r6, r1, r7 + ADD r0, r0, #0x10 + UMAAL r4, r6, r2, r7 + SUB lr, lr, #0x10 + UMAAL r5, r6, r3, r7 + LDM r0, {r0, r1, r2, r3} + STR r6, [sp, #32] + LDM lr!, {r6} + MOV r7, #0x0 + UMLAL r8, r7, r0, r6 + UMAAL r9, r7, r1, r6 + STR r8, [sp, #16] + UMAAL r10, r7, r2, r6 + UMAAL r11, r7, r3, r6 + 
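fe_mul_op, which begins above, multiplies two 8-limb field elements into a 16-limb intermediate before reducing. The UMULL/UMLAL/UMAAL chains are a schoolbook multiply in which UMAAL (a*b + c + d, which cannot overflow 64 bits because (2^32-1)^2 + 2*(2^32-1) = 2^64 - 1) absorbs the carries without extra instructions. A plain C equivalent of the 8x8-limb product, ignoring the careful register scheduling, could look like this (sketch only):

    #include <stdint.h>

    /* 256 x 256 -> 512 bit schoolbook multiplication on 32-bit limbs; this is
     * the product fe_mul_op accumulates with UMULL/UMAAL before its
     * reduction step. */
    static void mul_8x8(uint32_t t[16], const uint32_t a[8], const uint32_t b[8])
    {
        for (int i = 0; i < 16; i++)
            t[i] = 0;
        for (int i = 0; i < 8; i++) {
            uint64_t carry = 0;
            for (int j = 0; j < 8; j++) {
                uint64_t acc = (uint64_t)t[i + j] + (uint64_t)a[i] * b[j] + carry;
                t[i + j] = (uint32_t)acc;        /* low 32 bits stay in place */
                carry    = acc >> 32;            /* high 32 bits ripple up    */
            }
            t[i + 8] = (uint32_t)carry;
        }
    }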
LDM lr!, {r6} + MOV r8, #0x0 + UMLAL r9, r8, r0, r6 + UMAAL r10, r8, r1, r6 + STR r9, [sp, #20] + UMAAL r11, r8, r2, r6 + UMAAL r12, r8, r3, r6 + LDM lr!, {r6} + MOV r9, #0x0 + UMLAL r10, r9, r0, r6 + UMAAL r11, r9, r1, r6 + STR r10, [sp, #24] + UMAAL r12, r9, r2, r6 + UMAAL r4, r9, r3, r6 + LDM lr!, {r6} + MOV r10, #0x0 + UMLAL r11, r10, r0, r6 + UMAAL r12, r10, r1, r6 + STR r11, [sp, #28] + UMAAL r4, r10, r2, r6 + UMAAL r5, r10, r3, r6 + LDM lr!, {r11} + UMAAL r12, r7, r0, r11 + UMAAL r4, r7, r1, r11 + LDR r6, [sp, #32] + UMAAL r5, r7, r2, r11 + UMAAL r6, r7, r3, r11 + LDM lr!, {r11} + UMAAL r4, r8, r0, r11 + UMAAL r5, r8, r1, r11 + UMAAL r6, r8, r2, r11 + UMAAL r7, r8, r3, r11 + LDM lr, {r11, lr} + UMAAL r5, r9, r0, r11 + UMAAL r6, r10, r0, lr + UMAAL r6, r9, r1, r11 + UMAAL r7, r10, r1, lr + UMAAL r7, r9, r2, r11 + UMAAL r8, r10, r2, lr + UMAAL r8, r9, r3, r11 + UMAAL r9, r10, r3, lr + # Reduce + LDR r0, [sp, #28] + MOV lr, #0x25 + UMAAL r10, r0, r10, lr + MOV lr, #0x13 + LSL r0, r0, #1 + ORR r0, r0, r10, LSR #31 + MUL r11, r0, lr + POP {r0, r1, r2} + MOV lr, #0x26 + UMAAL r0, r11, r12, lr + UMAAL r1, r11, r4, lr + UMAAL r2, r11, r5, lr + POP {r3, r4, r5} + UMAAL r3, r11, r6, lr + UMAAL r4, r11, r7, lr + UMAAL r5, r11, r8, lr + POP {r6} + BFC r10, #31, #1 + UMAAL r6, r11, r9, lr + ADD r7, r10, r11 + LDR lr, [sp, #8] + # Store + STM lr, {r0, r1, r2, r3, r4, r5, r6, r7} + ADD sp, sp, #0x10 + POP {pc} + # Cycle Count = 239 + .size fe_mul_op,.-fe_mul_op + .text + .align 4 + .globl fe_mul + .type fe_mul, %function +fe_mul: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + BL fe_mul_op + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 24 + .size fe_mul,.-fe_mul + .text + .align 4 + .globl fe_sq_op + .type fe_sq_op, %function +fe_sq_op: + PUSH {lr} + SUB sp, sp, #0x20 + STR r0, [sp, #28] + LDM r1, {r0, r1, r2, r3, r4, r5, r6, r7} + # Square + UMULL r9, r10, r0, r0 + UMULL r11, r12, r0, r1 + ADDS r11, r11, r11 + MOV lr, #0x0 + UMAAL r10, r11, lr, lr + STM sp, {r9, r10} + MOV r8, lr + UMAAL r8, r12, r0, r2 + ADCS r8, r8, r8 + UMAAL r8, r11, r1, r1 + UMULL r9, r10, r0, r3 + UMAAL r9, r12, r1, r2 + ADCS r9, r9, r9 + UMAAL r9, r11, lr, lr + STRD r8, r9, [sp, #8] + MOV r9, lr + UMAAL r9, r10, r0, r4 + UMAAL r9, r12, r1, r3 + ADCS r9, r9, r9 + UMAAL r9, r11, r2, r2 + STR r9, [sp, #16] + UMULL r9, r8, r0, r5 + UMAAL r9, r12, r1, r4 + UMAAL r9, r10, r2, r3 + ADCS r9, r9, r9 + UMAAL r9, r11, lr, lr + STR r9, [sp, #20] + MOV r9, lr + UMAAL r9, r8, r0, r6 + UMAAL r9, r12, r1, r5 + UMAAL r9, r10, r2, r4 + ADCS r9, r9, r9 + UMAAL r9, r11, r3, r3 + STR r9, [sp, #24] + UMULL r0, r9, r0, r7 + UMAAL r0, r8, r1, r6 + UMAAL r0, r12, r2, r5 + UMAAL r0, r10, r3, r4 + ADCS r0, r0, r0 + UMAAL r0, r11, lr, lr + # R[7] = r0 + UMAAL r9, r8, r1, r7 + UMAAL r9, r10, r2, r6 + UMAAL r12, r9, r3, r5 + ADCS r12, r12, r12 + UMAAL r12, r11, r4, r4 + # R[8] = r12 + UMAAL r9, r8, r2, r7 + UMAAL r10, r9, r3, r6 + MOV r2, lr + UMAAL r10, r2, r4, r5 + ADCS r10, r10, r10 + UMAAL r11, r10, lr, lr + # R[9] = r11 + UMAAL r2, r8, r3, r7 + UMAAL r2, r9, r4, r6 + ADCS r3, r2, r2 + UMAAL r10, r3, r5, r5 + # R[10] = r10 + MOV r1, lr + UMAAL r1, r8, r4, r7 + UMAAL r1, r9, r5, r6 + ADCS r4, r1, r1 + UMAAL r3, r4, lr, lr + # R[11] = r3 + UMAAL r8, r9, r5, r7 + ADCS r8, r8, r8 + UMAAL r4, r8, r6, r6 + # R[12] = r4 + MOV r5, lr + UMAAL r5, r9, r6, r7 + ADCS r5, r5, r5 + UMAAL r8, r5, lr, lr + # R[13] = r8 + ADCS r9, r9, r9 + UMAAL r9, r5, r7, r7 + ADCS r7, r5, lr + # R[14] = r9 + # R[15] = r7 + # Reduce + MOV r6, #0x25 + UMAAL r7, r0, 
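The "# Reduce" blocks in fe_mul_op and fe_sq_op bring the 512-bit product back under 2^256 by folding the upper half with the constant 0x26 (38), since 2^256 = 38 (mod 2^255 - 19), and then folding the remaining carry and bit 255 with 0x13 (19). The result is only partially reduced, which the later operations tolerate. A hedged C sketch of the same fold, with simpler scheduling than the assembly:

    #include <stdint.h>

    /* Fold a 512-bit product t[0..15] into r[0..7], congruent mod
     * p = 2^255 - 19.  Uses 2^256 = 38 (mod p) and 2^255 = 19 (mod p);
     * like the assembly, the output may still be slightly above p. */
    static void fold_25519(uint32_t r[8], const uint32_t t[16])
    {
        uint64_t c = 0;
        for (int i = 0; i < 8; i++) {
            c  += (uint64_t)t[i] + (uint64_t)t[i + 8] * 38u;  /* low + 38*high */
            r[i] = (uint32_t)c;
            c  >>= 32;
        }
        uint64_t top = (c << 1) | (r[7] >> 31);   /* bits at/above 2^255 */
        r[7] &= 0x7FFFFFFFu;
        c = top * 19u;                            /* 2^255 = 19 (mod p)  */
        for (int i = 0; i < 8; i++) {
            c  += r[i];
            r[i] = (uint32_t)c;
            c  >>= 32;
        }
    }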
r7, r6 + MOV r6, #0x13 + LSL r0, r0, #1 + ORR r0, r0, r7, LSR #31 + MUL lr, r0, r6 + POP {r0, r1} + MOV r6, #0x26 + UMAAL r0, lr, r12, r6 + UMAAL r1, lr, r11, r6 + MOV r12, r3 + MOV r11, r4 + POP {r2, r3, r4} + UMAAL r2, lr, r10, r6 + UMAAL r3, lr, r12, r6 + UMAAL r4, lr, r11, r6 + MOV r12, r6 + POP {r5, r6} + UMAAL r5, lr, r8, r12 + BFC r7, #31, #1 + UMAAL r6, lr, r9, r12 + ADD r7, r7, lr + POP {lr} + # Store + STM lr, {r0, r1, r2, r3, r4, r5, r6, r7} + POP {pc} + # Cycle Count = 179 + .size fe_sq_op,.-fe_sq_op + .text + .align 4 + .globl fe_sq + .type fe_sq, %function +fe_sq: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + BL fe_sq_op + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 24 + .size fe_sq,.-fe_sq + .text + .align 4 + .globl fe_mul121666 + .type fe_mul121666, %function +fe_mul121666: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + # Multiply by 121666 + LDM r1, {r2, r3, r4, r5, r6, r7, r8, r9} + MOV r11, #0xdb42 + MOVT r11, #0x1 + UMULL r2, r12, r2, r11 + SUB r10, r11, #0x1 + UMAAL r3, r12, r3, r10 + UMAAL r4, r12, r4, r10 + UMAAL r5, r12, r5, r10 + UMAAL r6, r12, r6, r10 + UMAAL r7, r12, r7, r10 + UMAAL r8, r12, r8, r10 + MOV r11, #0x13 + UMAAL r9, r12, r9, r10 + LSL r12, r12, #1 + ORR r12, r12, r9, LSR #31 + MUL r12, r12, r11 + ADDS r2, r2, r12 + ADCS r3, r3, #0x0 + ADCS r4, r4, #0x0 + ADCS r5, r5, #0x0 + ADCS r6, r6, #0x0 + ADCS r7, r7, #0x0 + BFC r9, #31, #1 + ADCS r8, r8, #0x0 + ADC r9, r9, #0x0 + STM r0, {r2, r3, r4, r5, r6, r7, r8, r9} + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 69 + .size fe_mul121666,.-fe_mul121666 +#ifndef WC_NO_CACHE_RESISTANT + .text + .align 4 + .globl curve25519 + .type curve25519, %function +curve25519: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0xbc + STR r0, [sp, #160] + STR r1, [sp, #164] + STR r2, [sp, #168] + MOV r1, #0x0 + STR r1, [sp, #172] + # Set one + MOV r10, #0x1 + MOV r11, #0x0 + STRD r10, r11, [r0] + MOV r10, #0x0 + STRD r10, r11, [r0, #8] + STRD r10, r11, [r0, #16] + STRD r10, r11, [r0, #24] + # Set zero + MOV r10, #0x0 + MOV r11, #0x0 + STRD r10, r11, [sp] + STRD r10, r11, [sp, #8] + STRD r10, r11, [sp, #16] + STRD r10, r11, [sp, #24] + # Set one + MOV r10, #0x1 + MOV r11, #0x0 + STRD r10, r11, [sp, #32] + MOV r10, #0x0 + STRD r10, r11, [sp, #40] + STRD r10, r11, [sp, #48] + STRD r10, r11, [sp, #56] + ADD r3, sp, #0x40 + # Copy + LDM r2, {r4, r5, r6, r7, r8, r9, r10, r11} + STM r3, {r4, r5, r6, r7, r8, r9, r10, r11} + MOV r1, #0x1e + STR r1, [sp, #180] + MOV r2, #0x1c + STR r2, [sp, #176] +L_curve25519_words: +L_curve25519_bits: + LDR r1, [sp, #164] + LDR r2, [r1, r2] + LDR r1, [sp, #180] + LSR r2, r2, r1 + AND r2, r2, #0x1 + STR r2, [sp, #184] + LDR r1, [sp, #172] + EOR r1, r1, r2 + STR r1, [sp, #172] + LDR r0, [sp, #160] + # Conditional Swap + RSB r1, r1, #0x0 + LDRD r4, r5, [r0] + LDRD r6, r7, [sp, #64] + EOR r8, r4, r6 + EOR r9, r5, r7 + AND r8, r8, r1 + AND r9, r9, r1 + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r8 + EOR r7, r7, r9 + STRD r4, r5, [r0] + STRD r6, r7, [sp, #64] + LDRD r4, r5, [r0, #8] + LDRD r6, r7, [sp, #72] + EOR r8, r4, r6 + EOR r9, r5, r7 + AND r8, r8, r1 + AND r9, r9, r1 + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r8 + EOR r7, r7, r9 + STRD r4, r5, [r0, #8] + STRD r6, r7, [sp, #72] + LDRD r4, r5, [r0, #16] + LDRD r6, r7, [sp, #80] + EOR r8, r4, r6 + EOR r9, r5, r7 + AND r8, r8, r1 + AND r9, r9, r1 + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r8 + EOR r7, r7, r9 + STRD r4, r5, [r0, #16] + STRD r6, r7, [sp, #80] + LDRD r4, r5, [r0, #24] + LDRD r6, 
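The curve25519 routine that starts above (the default, cache-resistant build) runs a Montgomery ladder over the scalar bits, and before every ladder step it conditionally swaps (x2, z2) with (x3, z3) when the current bit differs from the previous one. The swap is done on the data itself with EOR/AND/EOR, so there is no secret-dependent branch or address. A minimal sketch of that primitive, assuming 8-limb elements:

    #include <stdint.h>

    /* Exchange a and b exactly when bit == 1, without a data-dependent
     * branch; the ladder above applies the same pattern with LDRD/STRD
     * pairs over the 32-byte elements. */
    static void ct_swap(uint32_t a[8], uint32_t b[8], uint32_t bit)
    {
        uint32_t mask = 0u - bit;                  /* all-ones when bit == 1 */
        for (int i = 0; i < 8; i++) {
            uint32_t t = (a[i] ^ b[i]) & mask;
            a[i] ^= t;
            b[i] ^= t;
        }
    }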
r7, [sp, #88] + EOR r8, r4, r6 + EOR r9, r5, r7 + AND r8, r8, r1 + AND r9, r9, r1 + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r8 + EOR r7, r7, r9 + STRD r4, r5, [r0, #24] + STRD r6, r7, [sp, #88] + LDR r1, [sp, #172] + # Conditional Swap + RSB r1, r1, #0x0 + LDRD r4, r5, [sp] + LDRD r6, r7, [sp, #32] + EOR r8, r4, r6 + EOR r9, r5, r7 + AND r8, r8, r1 + AND r9, r9, r1 + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r8 + EOR r7, r7, r9 + STRD r4, r5, [sp] + STRD r6, r7, [sp, #32] + LDRD r4, r5, [sp, #8] + LDRD r6, r7, [sp, #40] + EOR r8, r4, r6 + EOR r9, r5, r7 + AND r8, r8, r1 + AND r9, r9, r1 + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r8 + EOR r7, r7, r9 + STRD r4, r5, [sp, #8] + STRD r6, r7, [sp, #40] + LDRD r4, r5, [sp, #16] + LDRD r6, r7, [sp, #48] + EOR r8, r4, r6 + EOR r9, r5, r7 + AND r8, r8, r1 + AND r9, r9, r1 + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r8 + EOR r7, r7, r9 + STRD r4, r5, [sp, #16] + STRD r6, r7, [sp, #48] + LDRD r4, r5, [sp, #24] + LDRD r6, r7, [sp, #56] + EOR r8, r4, r6 + EOR r9, r5, r7 + AND r8, r8, r1 + AND r9, r9, r1 + EOR r4, r4, r8 + EOR r5, r5, r9 + EOR r6, r6, r8 + EOR r7, r7, r9 + STRD r4, r5, [sp, #24] + STRD r6, r7, [sp, #56] + LDR r1, [sp, #184] + STR r1, [sp, #172] + MOV r3, sp + LDR r2, [sp, #160] + ADD r1, sp, #0x80 + LDR r0, [sp, #160] + BL fe_add_sub_op + ADD r3, sp, #0x20 + ADD r2, sp, #0x40 + ADD r1, sp, #0x60 + MOV r0, sp + BL fe_add_sub_op + LDR r2, [sp, #160] + ADD r1, sp, #0x60 + ADD r0, sp, #0x20 + BL fe_mul_op + ADD r2, sp, #0x80 + MOV r1, sp + MOV r0, sp + BL fe_mul_op + ADD r1, sp, #0x80 + ADD r0, sp, #0x80 + BL fe_sq_op + LDR r1, [sp, #160] + ADD r0, sp, #0x60 + BL fe_sq_op + MOV r3, sp + ADD r2, sp, #0x20 + MOV r1, sp + ADD r0, sp, #0x40 + BL fe_add_sub_op + ADD r2, sp, #0x80 + ADD r1, sp, #0x60 + LDR r0, [sp, #160] + BL fe_mul_op + ADD r2, sp, #0x80 + ADD r1, sp, #0x60 + ADD r0, sp, #0x60 + BL fe_sub_op + MOV r1, sp + MOV r0, sp + BL fe_sq_op + ADD r1, sp, #0x60 + ADD r0, sp, #0x20 + BL fe_mul121666 + ADD r1, sp, #0x40 + ADD r0, sp, #0x40 + BL fe_sq_op + ADD r2, sp, #0x20 + ADD r1, sp, #0x80 + ADD r0, sp, #0x80 + BL fe_add_op + MOV r2, sp + LDR r1, [sp, #168] + ADD r0, sp, #0x20 + BL fe_mul_op + ADD r2, sp, #0x80 + ADD r1, sp, #0x60 + MOV r0, sp + BL fe_mul_op + LDR r2, [sp, #176] + LDR r1, [sp, #180] + SUBS r1, r1, #0x1 + STR r1, [sp, #180] + BGE L_curve25519_bits + MOV r1, #0x1f + STR r1, [sp, #180] + SUBS r2, r2, #0x4 + STR r2, [sp, #176] + BGE L_curve25519_words + # Invert + ADD r1, sp, #0x0 + ADD r0, sp, #0x20 + BL fe_sq_op + ADD r1, sp, #0x20 + ADD r0, sp, #0x40 + BL fe_sq_op + ADD r1, sp, #0x40 + ADD r0, sp, #0x40 + BL fe_sq_op + ADD r2, sp, #0x40 + ADD r1, sp, #0x0 + ADD r0, sp, #0x40 + BL fe_mul_op + ADD r2, sp, #0x40 + ADD r1, sp, #0x20 + ADD r0, sp, #0x20 + BL fe_mul_op + ADD r1, sp, #0x20 + ADD r0, sp, #0x60 + BL fe_sq_op + ADD r2, sp, #0x60 + ADD r1, sp, #0x40 + ADD r0, sp, #0x40 + BL fe_mul_op + ADD r1, sp, #0x40 + ADD r0, sp, #0x60 + BL fe_sq_op + MOV r12, #0x4 +L_curve25519_inv_1: + ADD r1, sp, #0x60 + ADD r0, sp, #0x60 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_curve25519_inv_1 + ADD r2, sp, #0x40 + ADD r1, sp, #0x60 + ADD r0, sp, #0x40 + BL fe_mul_op + ADD r1, sp, #0x40 + ADD r0, sp, #0x60 + BL fe_sq_op + MOV r12, #0x9 +L_curve25519_inv_2: + ADD r1, sp, #0x60 + ADD r0, sp, #0x60 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_curve25519_inv_2 + ADD r2, sp, #0x40 + ADD r1, sp, #0x60 + ADD r0, sp, #0x60 + BL fe_mul_op + ADD r1, sp, #0x60 + ADD r0, sp, #0x80 + 
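After the ladder, z2 has to be inverted so the affine result x2/z2 can be produced. Because p = 2^255 - 19 is prime, the inverse is z2^(p-2) = z2^(2^255 - 21), computed above with a fixed chain of 254 squarings and 11 multiplications (the L_curve25519_inv_* loops are the long squaring runs), so the operation sequence never depends on the data. A toy C illustration of the underlying identity, with small numbers rather than the fixed addition chain:

    #include <stdint.h>
    #include <stdio.h>

    /* Fermat's little theorem: for prime p and a not divisible by p,
     * a^(p-2) is the modular inverse of a. */
    static uint64_t powmod(uint64_t a, uint64_t e, uint64_t p)
    {
        uint64_t r = 1;
        a %= p;
        while (e != 0) {
            if (e & 1)
                r = (r * a) % p;
            a = (a * a) % p;
            e >>= 1;
        }
        return r;
    }

    int main(void)
    {
        uint64_t p = 65537, a = 12345;
        uint64_t inv = powmod(a, p - 2, p);
        printf("%llu\n", (unsigned long long)((a * inv) % p));  /* prints 1 */
        return 0;
    }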
BL fe_sq_op + MOV r12, #0x13 +L_curve25519_inv_3: + ADD r1, sp, #0x80 + ADD r0, sp, #0x80 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_curve25519_inv_3 + ADD r2, sp, #0x60 + ADD r1, sp, #0x80 + ADD r0, sp, #0x60 + BL fe_mul_op + MOV r12, #0xa +L_curve25519_inv_4: + ADD r1, sp, #0x60 + ADD r0, sp, #0x60 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_curve25519_inv_4 + ADD r2, sp, #0x40 + ADD r1, sp, #0x60 + ADD r0, sp, #0x40 + BL fe_mul_op + ADD r1, sp, #0x40 + ADD r0, sp, #0x60 + BL fe_sq_op + MOV r12, #0x31 +L_curve25519_inv_5: + ADD r1, sp, #0x60 + ADD r0, sp, #0x60 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_curve25519_inv_5 + ADD r2, sp, #0x40 + ADD r1, sp, #0x60 + ADD r0, sp, #0x60 + BL fe_mul_op + ADD r1, sp, #0x60 + ADD r0, sp, #0x80 + BL fe_sq_op + MOV r12, #0x63 +L_curve25519_inv_6: + ADD r1, sp, #0x80 + ADD r0, sp, #0x80 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_curve25519_inv_6 + ADD r2, sp, #0x60 + ADD r1, sp, #0x80 + ADD r0, sp, #0x60 + BL fe_mul_op + MOV r12, #0x32 +L_curve25519_inv_7: + ADD r1, sp, #0x60 + ADD r0, sp, #0x60 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_curve25519_inv_7 + ADD r2, sp, #0x40 + ADD r1, sp, #0x60 + ADD r0, sp, #0x40 + BL fe_mul_op + MOV r12, #0x5 +L_curve25519_inv_8: + ADD r1, sp, #0x40 + ADD r0, sp, #0x40 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_curve25519_inv_8 + ADD r2, sp, #0x20 + ADD r1, sp, #0x40 + ADD r0, sp, #0x0 + BL fe_mul_op + MOV r2, sp + LDR r1, [sp, #160] + LDR r0, [sp, #160] + BL fe_mul_op + MOV r0, #0x0 + ADD sp, sp, #0xbc + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 684 + .size curve25519,.-curve25519 +#else + .text + .align 4 + .globl curve25519 + .type curve25519, %function +curve25519: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0xc0 + STR r0, [sp, #176] + STR r1, [sp, #160] + STR r2, [sp, #172] + ADD r5, sp, #0x40 + ADD r4, sp, #0x20 + STR sp, [sp, #184] + STR r5, [sp, #180] + STR r4, [sp, #188] + MOV r1, #0x0 + STR r1, [sp, #164] + # Set one + MOV r10, #0x1 + MOV r11, #0x0 + STRD r10, r11, [r0] + MOV r10, #0x0 + STRD r10, r11, [r0, #8] + STRD r10, r11, [r0, #16] + STRD r10, r11, [r0, #24] + # Set zero + MOV r10, #0x0 + MOV r11, #0x0 + STRD r10, r11, [sp] + STRD r10, r11, [sp, #8] + STRD r10, r11, [sp, #16] + STRD r10, r11, [sp, #24] + # Set one + MOV r10, #0x1 + MOV r11, #0x0 + STRD r10, r11, [sp, #32] + MOV r10, #0x0 + STRD r10, r11, [sp, #40] + STRD r10, r11, [sp, #48] + STRD r10, r11, [sp, #56] + ADD r3, sp, #0x40 + # Copy + LDM r2, {r4, r5, r6, r7, r8, r9, r10, r11} + STM r3, {r4, r5, r6, r7, r8, r9, r10, r11} + MOV r2, #0xfe +L_curve25519_bits: + STR r2, [sp, #168] + LDR r1, [sp, #160] + AND r4, r2, #0x1f + LSR r2, r2, #5 + LDR r2, [r1, r2, LSL #2] + RSB r4, r4, #0x1f + LSL r2, r2, r4 + LDR r1, [sp, #164] + EOR r1, r1, r2 + ASR r1, r1, #31 + STR r2, [sp, #164] + # Conditional Swap + ADD r11, sp, #0xb0 + LDM r11, {r4, r5, r6, r7} + EOR r8, r4, r5 + EOR r9, r6, r7 + AND r8, r8, r1 + AND r9, r9, r1 + EOR r4, r4, r8 + EOR r5, r5, r8 + EOR r6, r6, r9 + EOR r7, r7, r9 + STM r11, {r4, r5, r6, r7} + # Ladder step + LDR r3, [sp, #184] + LDR r2, [sp, #176] + ADD r1, sp, #0x80 + LDR r0, [sp, #176] + BL fe_add_sub_op + LDR r3, [sp, #188] + LDR r2, [sp, #180] + ADD r1, sp, #0x60 + LDR r0, [sp, #184] + BL fe_add_sub_op + LDR r2, [sp, #176] + ADD r1, sp, #0x60 + LDR r0, [sp, #188] + BL fe_mul_op + ADD r2, sp, #0x80 + LDR r1, [sp, #184] + LDR r0, [sp, 
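The #else curve25519 above (used when WC_NO_CACHE_RESISTANT is defined) keeps the four working field elements behind pointers stored on the stack and swaps the pointers rather than the 32-byte values, which is cheaper but means the later loads use addresses derived from the secret bit; that is exactly the property the cache-resistant build avoids. Its bit schedule shifts scalar bit i (starting at 254 for a clamped scalar) up to the sign position, XORs it with the previously processed bit and turns the result into the swap mask. A C sketch of that schedule, with an assumed helper name:

    #include <stdint.h>

    /* Return an all-ones mask when bit i of the scalar (eight little-endian
     * 32-bit words) differs from the previously processed bit, else zero.
     * prev_hi carries the previous word shifted so its bit 31 is that bit. */
    static uint32_t swap_mask(const uint32_t n[8], int i, uint32_t *prev_hi)
    {
        uint32_t hi   = n[i >> 5] << (31 - (i & 31));  /* bit i now at bit 31 */
        uint32_t diff = (hi ^ *prev_hi) >> 31;         /* 1 when bits differ  */
        *prev_hi = hi;
        return 0u - diff;                              /* swap mask           */
    }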
#184] + BL fe_mul_op + ADD r1, sp, #0x80 + ADD r0, sp, #0x60 + BL fe_sq_op + LDR r1, [sp, #176] + ADD r0, sp, #0x80 + BL fe_sq_op + LDR r3, [sp, #184] + LDR r2, [sp, #188] + LDR r1, [sp, #184] + LDR r0, [sp, #180] + BL fe_add_sub_op + ADD r2, sp, #0x60 + ADD r1, sp, #0x80 + LDR r0, [sp, #176] + BL fe_mul_op + ADD r2, sp, #0x60 + ADD r1, sp, #0x80 + ADD r0, sp, #0x80 + BL fe_sub_op + LDR r1, [sp, #184] + LDR r0, [sp, #184] + BL fe_sq_op + ADD r1, sp, #0x80 + LDR r0, [sp, #188] + BL fe_mul121666 + LDR r1, [sp, #180] + LDR r0, [sp, #180] + BL fe_sq_op + LDR r2, [sp, #188] + ADD r1, sp, #0x60 + ADD r0, sp, #0x60 + BL fe_add_op + LDR r2, [sp, #184] + LDR r1, [sp, #172] + LDR r0, [sp, #188] + BL fe_mul_op + ADD r2, sp, #0x60 + ADD r1, sp, #0x80 + LDR r0, [sp, #184] + BL fe_mul_op + LDR r2, [sp, #168] + SUBS r2, r2, #0x1 + BGE L_curve25519_bits + # Cycle Count: 171 + LDR r1, [sp, #184] + # Copy + LDM r1, {r4, r5, r6, r7, r8, r9, r10, r11} + STM sp, {r4, r5, r6, r7, r8, r9, r10, r11} + # Invert + ADD r1, sp, #0x0 + ADD r0, sp, #0x20 + BL fe_sq_op + ADD r1, sp, #0x20 + ADD r0, sp, #0x40 + BL fe_sq_op + ADD r1, sp, #0x40 + ADD r0, sp, #0x40 + BL fe_sq_op + ADD r2, sp, #0x40 + ADD r1, sp, #0x0 + ADD r0, sp, #0x40 + BL fe_mul_op + ADD r2, sp, #0x40 + ADD r1, sp, #0x20 + ADD r0, sp, #0x20 + BL fe_mul_op + ADD r1, sp, #0x20 + ADD r0, sp, #0x60 + BL fe_sq_op + ADD r2, sp, #0x60 + ADD r1, sp, #0x40 + ADD r0, sp, #0x40 + BL fe_mul_op + ADD r1, sp, #0x40 + ADD r0, sp, #0x60 + BL fe_sq_op + MOV r12, #0x4 +L_curve25519_inv_1: + ADD r1, sp, #0x60 + ADD r0, sp, #0x60 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_curve25519_inv_1 + ADD r2, sp, #0x40 + ADD r1, sp, #0x60 + ADD r0, sp, #0x40 + BL fe_mul_op + ADD r1, sp, #0x40 + ADD r0, sp, #0x60 + BL fe_sq_op + MOV r12, #0x9 +L_curve25519_inv_2: + ADD r1, sp, #0x60 + ADD r0, sp, #0x60 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_curve25519_inv_2 + ADD r2, sp, #0x40 + ADD r1, sp, #0x60 + ADD r0, sp, #0x60 + BL fe_mul_op + ADD r1, sp, #0x60 + ADD r0, sp, #0x80 + BL fe_sq_op + MOV r12, #0x13 +L_curve25519_inv_3: + ADD r1, sp, #0x80 + ADD r0, sp, #0x80 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_curve25519_inv_3 + ADD r2, sp, #0x60 + ADD r1, sp, #0x80 + ADD r0, sp, #0x60 + BL fe_mul_op + MOV r12, #0xa +L_curve25519_inv_4: + ADD r1, sp, #0x60 + ADD r0, sp, #0x60 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_curve25519_inv_4 + ADD r2, sp, #0x40 + ADD r1, sp, #0x60 + ADD r0, sp, #0x40 + BL fe_mul_op + ADD r1, sp, #0x40 + ADD r0, sp, #0x60 + BL fe_sq_op + MOV r12, #0x31 +L_curve25519_inv_5: + ADD r1, sp, #0x60 + ADD r0, sp, #0x60 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_curve25519_inv_5 + ADD r2, sp, #0x40 + ADD r1, sp, #0x60 + ADD r0, sp, #0x60 + BL fe_mul_op + ADD r1, sp, #0x60 + ADD r0, sp, #0x80 + BL fe_sq_op + MOV r12, #0x63 +L_curve25519_inv_6: + ADD r1, sp, #0x80 + ADD r0, sp, #0x80 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_curve25519_inv_6 + ADD r2, sp, #0x60 + ADD r1, sp, #0x80 + ADD r0, sp, #0x60 + BL fe_mul_op + MOV r12, #0x32 +L_curve25519_inv_7: + ADD r1, sp, #0x60 + ADD r0, sp, #0x60 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_curve25519_inv_7 + ADD r2, sp, #0x40 + ADD r1, sp, #0x60 + ADD r0, sp, #0x40 + BL fe_mul_op + MOV r12, #0x5 +L_curve25519_inv_8: + ADD r1, sp, #0x40 + ADD r0, sp, #0x40 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_curve25519_inv_8 + ADD r2, 
sp, #0x20 + ADD r1, sp, #0x40 + ADD r0, sp, #0x0 + BL fe_mul_op + LDR r2, [sp, #184] + LDR r1, [sp, #176] + LDR r0, [sp, #176] + BL fe_mul_op + # Ensure result is less than modulus + LDR r0, [sp, #176] + LDM r0, {r4, r5, r6, r7, r8, r9, r10, r11} + MOV r2, #0x13 + AND r2, r2, r11, ASR #31 + ADDS r4, r4, r2 + ADCS r5, r5, #0x0 + ADCS r6, r6, #0x0 + ADCS r7, r7, #0x0 + ADCS r8, r8, #0x0 + ADCS r9, r9, #0x0 + BFC r11, #31, #1 + ADCS r10, r10, #0x0 + ADC r11, r11, #0x0 + STM r0, {r4, r5, r6, r7, r8, r9, r10, r11} + MOV r0, #0x0 + ADD sp, sp, #0xc0 + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 595 + .size curve25519,.-curve25519 +#endif /* WC_NO_CACHE_RESISTANT */ +#ifdef HAVE_ED25519 + .text + .align 4 + .globl fe_invert + .type fe_invert, %function +fe_invert: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0x88 + # Invert + STR r0, [sp, #128] + STR r1, [sp, #132] + LDR r1, [sp, #132] + MOV r0, sp + BL fe_sq_op + MOV r1, sp + ADD r0, sp, #0x20 + BL fe_sq_op + ADD r1, sp, #0x20 + ADD r0, sp, #0x20 + BL fe_sq_op + ADD r2, sp, #0x20 + LDR r1, [sp, #132] + ADD r0, sp, #0x20 + BL fe_mul_op + ADD r2, sp, #0x20 + MOV r1, sp + MOV r0, sp + BL fe_mul_op + MOV r1, sp + ADD r0, sp, #0x40 + BL fe_sq_op + ADD r2, sp, #0x40 + ADD r1, sp, #0x20 + ADD r0, sp, #0x20 + BL fe_mul_op + ADD r1, sp, #0x20 + ADD r0, sp, #0x40 + BL fe_sq_op + MOV r12, #0x4 +L_fe_invert1: + ADD r1, sp, #0x40 + ADD r0, sp, #0x40 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_fe_invert1 + ADD r2, sp, #0x20 + ADD r1, sp, #0x40 + ADD r0, sp, #0x20 + BL fe_mul_op + ADD r1, sp, #0x20 + ADD r0, sp, #0x40 + BL fe_sq_op + MOV r12, #0x9 +L_fe_invert2: + ADD r1, sp, #0x40 + ADD r0, sp, #0x40 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_fe_invert2 + ADD r2, sp, #0x20 + ADD r1, sp, #0x40 + ADD r0, sp, #0x40 + BL fe_mul_op + ADD r1, sp, #0x40 + ADD r0, sp, #0x60 + BL fe_sq_op + MOV r12, #0x13 +L_fe_invert3: + ADD r1, sp, #0x60 + ADD r0, sp, #0x60 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_fe_invert3 + ADD r2, sp, #0x40 + ADD r1, sp, #0x60 + ADD r0, sp, #0x40 + BL fe_mul_op + MOV r12, #0xa +L_fe_invert4: + ADD r1, sp, #0x40 + ADD r0, sp, #0x40 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_fe_invert4 + ADD r2, sp, #0x20 + ADD r1, sp, #0x40 + ADD r0, sp, #0x20 + BL fe_mul_op + ADD r1, sp, #0x20 + ADD r0, sp, #0x40 + BL fe_sq_op + MOV r12, #0x31 +L_fe_invert5: + ADD r1, sp, #0x40 + ADD r0, sp, #0x40 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_fe_invert5 + ADD r2, sp, #0x20 + ADD r1, sp, #0x40 + ADD r0, sp, #0x40 + BL fe_mul_op + ADD r1, sp, #0x40 + ADD r0, sp, #0x60 + BL fe_sq_op + MOV r12, #0x63 +L_fe_invert6: + ADD r1, sp, #0x60 + ADD r0, sp, #0x60 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_fe_invert6 + ADD r2, sp, #0x40 + ADD r1, sp, #0x60 + ADD r0, sp, #0x40 + BL fe_mul_op + MOV r12, #0x32 +L_fe_invert7: + ADD r1, sp, #0x40 + ADD r0, sp, #0x40 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_fe_invert7 + ADD r2, sp, #0x20 + ADD r1, sp, #0x40 + ADD r0, sp, #0x20 + BL fe_mul_op + MOV r12, #0x5 +L_fe_invert8: + ADD r1, sp, #0x20 + ADD r0, sp, #0x20 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_fe_invert8 + MOV r2, sp + ADD r1, sp, #0x20 + LDR r0, [sp, #128] + BL fe_mul_op + LDR r1, [sp, #132] + LDR r0, [sp, #128] + ADD sp, sp, #0x88 + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 292 + .size fe_invert,.-fe_invert + .text 
+ .align 4 + .globl fe_sq2 + .type fe_sq2, %function +fe_sq2: + PUSH {lr} + SUB sp, sp, #0x24 + STRD r0, r1, [sp, #28] + LDM r1, {r0, r1, r2, r3, r4, r5, r6, r7} + # Square * 2 + UMULL r9, r10, r0, r0 + UMULL r11, r12, r0, r1 + ADDS r11, r11, r11 + MOV lr, #0x0 + UMAAL r10, r11, lr, lr + STM sp, {r9, r10} + MOV r8, lr + UMAAL r8, r12, r0, r2 + ADCS r8, r8, r8 + UMAAL r8, r11, r1, r1 + UMULL r9, r10, r0, r3 + UMAAL r9, r12, r1, r2 + ADCS r9, r9, r9 + UMAAL r9, r11, lr, lr + STRD r8, r9, [sp, #8] + MOV r9, lr + UMAAL r9, r10, r0, r4 + UMAAL r9, r12, r1, r3 + ADCS r9, r9, r9 + UMAAL r9, r11, r2, r2 + STR r9, [sp, #16] + UMULL r9, r8, r0, r5 + UMAAL r9, r12, r1, r4 + UMAAL r9, r10, r2, r3 + ADCS r9, r9, r9 + UMAAL r9, r11, lr, lr + STR r9, [sp, #20] + MOV r9, lr + UMAAL r9, r8, r0, r6 + UMAAL r9, r12, r1, r5 + UMAAL r9, r10, r2, r4 + ADCS r9, r9, r9 + UMAAL r9, r11, r3, r3 + STR r9, [sp, #24] + UMULL r0, r9, r0, r7 + UMAAL r0, r8, r1, r6 + UMAAL r0, r12, r2, r5 + UMAAL r0, r10, r3, r4 + ADCS r0, r0, r0 + UMAAL r0, r11, lr, lr + # R[7] = r0 + UMAAL r9, r8, r1, r7 + UMAAL r9, r10, r2, r6 + UMAAL r12, r9, r3, r5 + ADCS r12, r12, r12 + UMAAL r12, r11, r4, r4 + # R[8] = r12 + UMAAL r9, r8, r2, r7 + UMAAL r10, r9, r3, r6 + MOV r2, lr + UMAAL r10, r2, r4, r5 + ADCS r10, r10, r10 + UMAAL r11, r10, lr, lr + # R[9] = r11 + UMAAL r2, r8, r3, r7 + UMAAL r2, r9, r4, r6 + ADCS r3, r2, r2 + UMAAL r10, r3, r5, r5 + # R[10] = r10 + MOV r1, lr + UMAAL r1, r8, r4, r7 + UMAAL r1, r9, r5, r6 + ADCS r4, r1, r1 + UMAAL r3, r4, lr, lr + # R[11] = r3 + UMAAL r8, r9, r5, r7 + ADCS r8, r8, r8 + UMAAL r4, r8, r6, r6 + # R[12] = r4 + MOV r5, lr + UMAAL r5, r9, r6, r7 + ADCS r5, r5, r5 + UMAAL r8, r5, lr, lr + # R[13] = r8 + ADCS r9, r9, r9 + UMAAL r9, r5, r7, r7 + ADCS r7, r5, lr + # R[14] = r9 + # R[15] = r7 + # Reduce + MOV r6, #0x25 + UMAAL r7, r0, r7, r6 + MOV r6, #0x13 + LSL r0, r0, #1 + ORR r0, r0, r7, LSR #31 + MUL lr, r0, r6 + POP {r0, r1} + MOV r6, #0x26 + UMAAL r0, lr, r12, r6 + UMAAL r1, lr, r11, r6 + MOV r12, r3 + MOV r11, r4 + POP {r2, r3, r4} + UMAAL r2, lr, r10, r6 + UMAAL r3, lr, r12, r6 + UMAAL r4, lr, r11, r6 + MOV r12, r6 + POP {r5, r6} + UMAAL r5, lr, r8, r12 + BFC r7, #31, #1 + UMAAL r6, lr, r9, r12 + ADD r7, r7, lr + # Reduce if top bit set + MOV r11, #0x13 + AND r12, r11, r7, ASR #31 + ADDS r0, r0, r12 + ADCS r1, r1, #0x0 + ADCS r2, r2, #0x0 + ADCS r3, r3, #0x0 + ADCS r4, r4, #0x0 + ADCS r5, r5, #0x0 + BFC r7, #31, #1 + ADCS r6, r6, #0x0 + ADC r7, r7, #0x0 + # Double + ADDS r0, r0, r0 + ADCS r1, r1, r1 + ADCS r2, r2, r2 + ADCS r3, r3, r3 + ADCS r4, r4, r4 + ADCS r5, r5, r5 + ADCS r6, r6, r6 + ADC r7, r7, r7 + # Reduce if top bit set + MOV r11, #0x13 + AND r12, r11, r7, ASR #31 + ADDS r0, r0, r12 + ADCS r1, r1, #0x0 + ADCS r2, r2, #0x0 + ADCS r3, r3, #0x0 + ADCS r4, r4, #0x0 + ADCS r5, r5, #0x0 + BFC r7, #31, #1 + ADCS r6, r6, #0x0 + ADC r7, r7, #0x0 + POP {r12, lr} + # Store + STM r12, {r0, r1, r2, r3, r4, r5, r6, r7} + MOV r0, r12 + MOV r1, lr + POP {pc} + # Cycle Count = 213 + .size fe_sq2,.-fe_sq2 + .text + .align 4 + .globl fe_pow22523 + .type fe_pow22523, %function +fe_pow22523: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0x68 + # pow22523 + STR r0, [sp, #96] + STR r1, [sp, #100] + LDR r1, [sp, #100] + MOV r0, sp + BL fe_sq_op + MOV r1, sp + ADD r0, sp, #0x20 + BL fe_sq_op + ADD r1, sp, #0x20 + ADD r0, sp, #0x20 + BL fe_sq_op + ADD r2, sp, #0x20 + LDR r1, [sp, #100] + ADD r0, sp, #0x20 + BL fe_mul_op + ADD r2, sp, #0x20 + MOV r1, sp + MOV r0, sp + BL fe_mul_op + MOV r1, sp + 
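fe_sq2 above returns 2*a^2 (the fe_sq_op squaring core followed by a doubling and an extra fold), and fe_pow22523, which starts next, raises its input to (p-5)/8 = 2^252 - 3. That power is the building block for square roots during Ed25519 point decompression: for p = 5 (mod 8), a^((p+3)/8) is a root of either a or -a, and the wrong case is corrected by multiplying with sqrt(-1) = 2^((p-1)/4). A toy C check of that square-root rule with small numbers (illustration only; the real code works on field elements):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t powmod(uint64_t a, uint64_t e, uint64_t p)
    {
        uint64_t r = 1;
        a %= p;
        while (e != 0) {
            if (e & 1)
                r = (r * a) % p;
            a = (a * a) % p;
            e >>= 1;
        }
        return r;
    }

    int main(void)
    {
        uint64_t p = 13, a = 10;                      /* 10 = 6^2 (mod 13)    */
        uint64_t r = powmod(a, (p + 3) / 8, p);       /* candidate root       */
        if ((r * r) % p != a)                         /* got a root of -a?    */
            r = (r * powmod(2, (p - 1) / 4, p)) % p;  /* fix with sqrt(-1)    */
        printf("%llu\n", (unsigned long long)((r * r) % p));  /* prints 10    */
        return 0;
    }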
MOV r0, sp + BL fe_sq_op + MOV r2, sp + ADD r1, sp, #0x20 + MOV r0, sp + BL fe_mul_op + MOV r1, sp + ADD r0, sp, #0x20 + BL fe_sq_op + MOV r12, #0x4 +L_fe_pow22523_1: + ADD r1, sp, #0x20 + ADD r0, sp, #0x20 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_fe_pow22523_1 + MOV r2, sp + ADD r1, sp, #0x20 + MOV r0, sp + BL fe_mul_op + MOV r1, sp + ADD r0, sp, #0x20 + BL fe_sq_op + MOV r12, #0x9 +L_fe_pow22523_2: + ADD r1, sp, #0x20 + ADD r0, sp, #0x20 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_fe_pow22523_2 + MOV r2, sp + ADD r1, sp, #0x20 + ADD r0, sp, #0x20 + BL fe_mul_op + ADD r1, sp, #0x20 + ADD r0, sp, #0x40 + BL fe_sq_op + MOV r12, #0x13 +L_fe_pow22523_3: + ADD r1, sp, #0x40 + ADD r0, sp, #0x40 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_fe_pow22523_3 + ADD r2, sp, #0x20 + ADD r1, sp, #0x40 + ADD r0, sp, #0x20 + BL fe_mul_op + MOV r12, #0xa +L_fe_pow22523_4: + ADD r1, sp, #0x20 + ADD r0, sp, #0x20 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_fe_pow22523_4 + MOV r2, sp + ADD r1, sp, #0x20 + MOV r0, sp + BL fe_mul_op + MOV r1, sp + ADD r0, sp, #0x20 + BL fe_sq_op + MOV r12, #0x31 +L_fe_pow22523_5: + ADD r1, sp, #0x20 + ADD r0, sp, #0x20 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_fe_pow22523_5 + MOV r2, sp + ADD r1, sp, #0x20 + ADD r0, sp, #0x20 + BL fe_mul_op + ADD r1, sp, #0x20 + ADD r0, sp, #0x40 + BL fe_sq_op + MOV r12, #0x63 +L_fe_pow22523_6: + ADD r1, sp, #0x40 + ADD r0, sp, #0x40 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_fe_pow22523_6 + ADD r2, sp, #0x20 + ADD r1, sp, #0x40 + ADD r0, sp, #0x20 + BL fe_mul_op + MOV r12, #0x32 +L_fe_pow22523_7: + ADD r1, sp, #0x20 + ADD r0, sp, #0x20 + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_fe_pow22523_7 + MOV r2, sp + ADD r1, sp, #0x20 + MOV r0, sp + BL fe_mul_op + MOV r12, #0x2 +L_fe_pow22523_8: + MOV r1, sp + MOV r0, sp + PUSH {r12} + BL fe_sq_op + POP {r12} + SUBS r12, r12, #0x1 + BNE L_fe_pow22523_8 + LDR r2, [sp, #100] + MOV r1, sp + LDR r0, [sp, #96] + BL fe_mul_op + LDR r1, [sp, #100] + LDR r0, [sp, #96] + ADD sp, sp, #0x68 + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 293 + .size fe_pow22523,.-fe_pow22523 + .text + .align 4 + .globl ge_p1p1_to_p2 + .type ge_p1p1_to_p2, %function +ge_p1p1_to_p2: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0x8 + STR r0, [sp] + STR r1, [sp, #4] + ADD r2, r1, #0x60 + BL fe_mul_op + LDR r0, [sp] + LDR r1, [sp, #4] + ADD r2, r1, #0x40 + ADD r1, r1, #0x20 + ADD r0, r0, #0x20 + BL fe_mul_op + LDR r0, [sp] + LDR r1, [sp, #4] + ADD r2, r1, #0x60 + ADD r1, r1, #0x40 + ADD r0, r0, #0x40 + BL fe_mul_op + ADD sp, sp, #0x8 + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 53 + .size ge_p1p1_to_p2,.-ge_p1p1_to_p2 + .text + .align 4 + .globl ge_p1p1_to_p3 + .type ge_p1p1_to_p3, %function +ge_p1p1_to_p3: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0x8 + STR r0, [sp] + STR r1, [sp, #4] + ADD r2, r1, #0x60 + BL fe_mul_op + LDR r0, [sp] + LDR r1, [sp, #4] + ADD r2, r1, #0x40 + ADD r1, r1, #0x20 + ADD r0, r0, #0x20 + BL fe_mul_op + LDR r0, [sp] + LDR r1, [sp, #4] + ADD r2, r1, #0x60 + ADD r1, r1, #0x40 + ADD r0, r0, #0x40 + BL fe_mul_op + LDR r0, [sp] + LDR r1, [sp, #4] + ADD r2, r1, #0x20 + ADD r0, r0, #0x60 + BL fe_mul_op + ADD sp, sp, #0x8 + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 63 + .size ge_p1p1_to_p3,.-ge_p1p1_to_p3 + .text + .align 4 + .globl ge_p2_dbl + .type 
ge_p2_dbl, %function +ge_p2_dbl: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0x8 + STR r0, [sp] + STR r1, [sp, #4] + BL fe_sq_op + LDR r0, [sp] + LDR r1, [sp, #4] + ADD r1, r1, #0x20 + ADD r0, r0, #0x40 + BL fe_sq_op + LDR r0, [sp] + LDR r1, [sp, #4] + ADD r2, r1, #0x20 + ADD r0, r0, #0x20 + BL fe_add_op + MOV r1, r0 + ADD r0, r0, #0x40 + BL fe_sq_op + LDR r0, [sp] + MOV r3, r0 + ADD r2, r0, #0x40 + ADD r1, r0, #0x40 + ADD r0, r0, #0x20 + BL fe_add_sub_op + MOV r2, r0 + ADD r1, r0, #0x40 + SUB r0, r0, #0x20 + BL fe_sub_op + LDR r1, [sp, #4] + ADD r1, r1, #0x40 + ADD r0, r0, #0x60 + BL fe_sq2 + SUB r2, r0, #0x20 + MOV r1, r0 + BL fe_sub_op + ADD sp, sp, #0x8 + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 87 + .size ge_p2_dbl,.-ge_p2_dbl + .text + .align 4 + .globl ge_madd + .type ge_madd, %function +ge_madd: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0xc + STR r0, [sp] + STR r1, [sp, #4] + STR r2, [sp, #8] + MOV r2, r1 + ADD r1, r1, #0x20 + BL fe_add_op + LDR r1, [sp, #4] + MOV r2, r1 + ADD r1, r1, #0x20 + ADD r0, r0, #0x20 + BL fe_sub_op + LDR r2, [sp, #8] + SUB r1, r0, #0x20 + ADD r0, r0, #0x20 + BL fe_mul_op + LDR r0, [sp] + LDR r2, [sp, #8] + ADD r2, r2, #0x20 + ADD r1, r0, #0x20 + ADD r0, r0, #0x20 + BL fe_mul_op + LDR r0, [sp] + LDR r1, [sp, #8] + LDR r2, [sp, #4] + ADD r2, r2, #0x60 + ADD r1, r1, #0x40 + ADD r0, r0, #0x60 + BL fe_mul_op + LDR r0, [sp] + ADD r3, r0, #0x20 + ADD r2, r0, #0x40 + MOV r1, r0 + ADD r0, r0, #0x20 + BL fe_add_sub_op + LDR r1, [sp, #4] + ADD r1, r1, #0x40 + ADD r0, r0, #0x20 + # Double + LDM r1, {r4, r5, r6, r7, r8, r9, r10, r11} + ADDS r4, r4, r4 + ADCS r5, r5, r5 + ADCS r6, r6, r6 + ADCS r7, r7, r7 + ADCS r8, r8, r8 + ADCS r9, r9, r9 + ADCS r10, r10, r10 + MOV lr, #0x0 + ADCS r11, r11, r11 + ADC lr, lr, #0x0 + MOV r12, #0x13 + LSL lr, lr, #1 + ORR lr, lr, r11, LSR #31 + MUL r12, lr, r12 + ADDS r4, r4, r12 + ADCS r5, r5, #0x0 + ADCS r6, r6, #0x0 + ADCS r7, r7, #0x0 + ADCS r8, r8, #0x0 + ADCS r9, r9, #0x0 + BFC r11, #31, #1 + ADCS r10, r10, #0x0 + ADC r11, r11, #0x0 + STM r0, {r4, r5, r6, r7, r8, r9, r10, r11} + # Done Double + ADD r3, r0, #0x20 + ADD r1, r0, #0x20 + BL fe_add_sub_op + ADD sp, sp, #0xc + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 136 + .size ge_madd,.-ge_madd + .text + .align 4 + .globl ge_msub + .type ge_msub, %function +ge_msub: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0xc + STR r0, [sp] + STR r1, [sp, #4] + STR r2, [sp, #8] + MOV r2, r1 + ADD r1, r1, #0x20 + BL fe_add_op + LDR r1, [sp, #4] + MOV r2, r1 + ADD r1, r1, #0x20 + ADD r0, r0, #0x20 + BL fe_sub_op + LDR r2, [sp, #8] + ADD r2, r2, #0x20 + SUB r1, r0, #0x20 + ADD r0, r0, #0x20 + BL fe_mul_op + LDR r0, [sp] + LDR r2, [sp, #8] + ADD r1, r0, #0x20 + ADD r0, r0, #0x20 + BL fe_mul_op + LDR r0, [sp] + LDR r1, [sp, #8] + LDR r2, [sp, #4] + ADD r2, r2, #0x60 + ADD r1, r1, #0x40 + ADD r0, r0, #0x60 + BL fe_mul_op + LDR r0, [sp] + ADD r3, r0, #0x20 + ADD r2, r0, #0x40 + MOV r1, r0 + ADD r0, r0, #0x20 + BL fe_add_sub_op + LDR r1, [sp, #4] + ADD r1, r1, #0x40 + ADD r0, r0, #0x20 + # Double + LDM r1, {r4, r5, r6, r7, r8, r9, r10, r11} + ADDS r4, r4, r4 + ADCS r5, r5, r5 + ADCS r6, r6, r6 + ADCS r7, r7, r7 + ADCS r8, r8, r8 + ADCS r9, r9, r9 + ADCS r10, r10, r10 + MOV lr, #0x0 + ADCS r11, r11, r11 + ADC lr, lr, #0x0 + MOV r12, #0x13 + LSL lr, lr, #1 + ORR lr, lr, r11, LSR #31 + MUL r12, lr, r12 + ADDS r4, r4, r12 + ADCS r5, r5, #0x0 + ADCS r6, r6, #0x0 + ADCS r7, r7, #0x0 + ADCS r8, r8, #0x0 + ADCS r9, r9, 
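ge_madd and ge_msub above add or subtract a precomputed base-table point. The table entries are not stored as (x, y) but as (y+x, y-x, 2*d*x*y), which together with the inline "# Double" of Z lets a mixed addition finish in three field multiplications (the three fe_mul_op calls). An illustrative layout and formula, using my own type names rather than the wolfSSL headers:

    #include <stdint.h>

    typedef uint32_t fe_limbs[8];      /* field element, 8 x 32-bit limbs */

    /* Precomputed point in (y+x, y-x, 2*d*x*y) form, d being the Edwards
     * curve constant.  Mixed addition with an extended point (X1:Y1:Z1:T1):
     *   A = (Y1+X1)*yplusx, B = (Y1-X1)*yminusx, C = T1*xy2d, D = 2*Z1
     *   X3 = A-B, Y3 = A+B, Z3 = D+C, T3 = D-C   (completed coordinates)
     * ge_msub is the same with yplusx/yminusx swapped and C negated. */
    typedef struct {
        fe_limbs yplusx;
        fe_limbs yminusx;
        fe_limbs xy2d;
    } ge_precomp_sketch;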
#0x0 + BFC r11, #31, #1 + ADCS r10, r10, #0x0 + ADC r11, r11, #0x0 + STM r0, {r4, r5, r6, r7, r8, r9, r10, r11} + # Done Double + ADD r3, r0, #0x20 + MOV r1, r0 + ADD r0, r0, #0x20 + BL fe_add_sub_op + ADD sp, sp, #0xc + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 137 + .size ge_msub,.-ge_msub + .text + .align 4 + .globl ge_add + .type ge_add, %function +ge_add: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0x2c + STR r0, [sp] + STR r1, [sp, #4] + STR r2, [sp, #8] + MOV r3, r1 + ADD r2, r1, #0x20 + ADD r1, r0, #0x20 + BL fe_add_sub_op + LDR r2, [sp, #8] + MOV r1, r0 + ADD r0, r0, #0x40 + BL fe_mul_op + LDR r0, [sp] + LDR r2, [sp, #8] + ADD r2, r2, #0x20 + ADD r1, r0, #0x20 + ADD r0, r0, #0x20 + BL fe_mul_op + LDR r0, [sp] + LDR r1, [sp, #8] + LDR r2, [sp, #4] + ADD r2, r2, #0x60 + ADD r1, r1, #0x60 + ADD r0, r0, #0x60 + BL fe_mul_op + LDR r0, [sp] + LDR r1, [sp, #4] + LDR r2, [sp, #8] + ADD r2, r2, #0x40 + ADD r1, r1, #0x40 + BL fe_mul_op + LDR r1, [sp] + ADD r0, sp, #0xc + # Double + LDM r1, {r4, r5, r6, r7, r8, r9, r10, r11} + ADDS r4, r4, r4 + ADCS r5, r5, r5 + ADCS r6, r6, r6 + ADCS r7, r7, r7 + ADCS r8, r8, r8 + ADCS r9, r9, r9 + ADCS r10, r10, r10 + MOV lr, #0x0 + ADCS r11, r11, r11 + ADC lr, lr, #0x0 + MOV r12, #0x13 + LSL lr, lr, #1 + ORR lr, lr, r11, LSR #31 + MUL r12, lr, r12 + ADDS r4, r4, r12 + ADCS r5, r5, #0x0 + ADCS r6, r6, #0x0 + ADCS r7, r7, #0x0 + ADCS r8, r8, #0x0 + ADCS r9, r9, #0x0 + BFC r11, #31, #1 + ADCS r10, r10, #0x0 + ADC r11, r11, #0x0 + STM r0, {r4, r5, r6, r7, r8, r9, r10, r11} + # Done Double + ADD r3, r1, #0x20 + ADD r2, r1, #0x40 + ADD r0, r1, #0x20 + BL fe_add_sub_op + ADD r3, r0, #0x40 + ADD r2, sp, #0xc + ADD r1, r0, #0x40 + ADD r0, r0, #0x20 + BL fe_add_sub_op + ADD sp, sp, #0x2c + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 138 + .size ge_add,.-ge_add + .text + .align 4 + .globl ge_sub + .type ge_sub, %function +ge_sub: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0x2c + STR r0, [sp] + STR r1, [sp, #4] + STR r2, [sp, #8] + MOV r3, r1 + ADD r2, r1, #0x20 + ADD r1, r0, #0x20 + BL fe_add_sub_op + LDR r2, [sp, #8] + ADD r2, r2, #0x20 + MOV r1, r0 + ADD r0, r0, #0x40 + BL fe_mul_op + LDR r0, [sp] + LDR r2, [sp, #8] + ADD r1, r0, #0x20 + ADD r0, r0, #0x20 + BL fe_mul_op + LDR r0, [sp] + LDR r1, [sp, #8] + LDR r2, [sp, #4] + ADD r2, r2, #0x60 + ADD r1, r1, #0x60 + ADD r0, r0, #0x60 + BL fe_mul_op + LDR r0, [sp] + LDR r1, [sp, #4] + LDR r2, [sp, #8] + ADD r2, r2, #0x40 + ADD r1, r1, #0x40 + BL fe_mul_op + LDR r1, [sp] + ADD r0, sp, #0xc + # Double + LDM r1, {r4, r5, r6, r7, r8, r9, r10, r11} + ADDS r4, r4, r4 + ADCS r5, r5, r5 + ADCS r6, r6, r6 + ADCS r7, r7, r7 + ADCS r8, r8, r8 + ADCS r9, r9, r9 + ADCS r10, r10, r10 + MOV lr, #0x0 + ADCS r11, r11, r11 + ADC lr, lr, #0x0 + MOV r12, #0x13 + LSL lr, lr, #1 + ORR lr, lr, r11, LSR #31 + MUL r12, lr, r12 + ADDS r4, r4, r12 + ADCS r5, r5, #0x0 + ADCS r6, r6, #0x0 + ADCS r7, r7, #0x0 + ADCS r8, r8, #0x0 + ADCS r9, r9, #0x0 + BFC r11, #31, #1 + ADCS r10, r10, #0x0 + ADC r11, r11, #0x0 + STM r0, {r4, r5, r6, r7, r8, r9, r10, r11} + # Done Double + ADD r3, r1, #0x20 + ADD r2, r1, #0x40 + ADD r0, r1, #0x20 + BL fe_add_sub_op + ADD r3, r0, #0x40 + ADD r2, sp, #0xc + ADD r1, r0, #0x20 + ADD r0, r0, #0x40 + BL fe_add_sub_op + ADD sp, sp, #0x2c + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 138 + .size ge_sub,.-ge_sub + .text + .align 4 + .globl sc_reduce + .type sc_reduce, %function +sc_reduce: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + 
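sc_reduce, which starts above, reduces a 512-bit value modulo the Ed25519 group order n = 2^252 + c with c = 27742317777372353535851937790883648493 (a 125-bit number). Because the order has this 2^k + c shape, everything at and above bit 252 can be folded down using 2^252 = -c (mod n): the MOVW/MOVT pairs (0xA30A2C13, 0xA7ED9CE5, 0x5D086329, 0xEB210621) are the little-endian limbs of 2^128 - c, i.e. -c over 128 bits, used in those multiply-and-fold steps, before a final conditional subtraction of the order. A toy C check of the congruence the fold relies on:

    #include <stdint.h>
    #include <stdio.h>

    /* For a modulus n = 2^k + c we have 2^k = -c (mod n), so a wide value
     * hi*2^k + lo is congruent to lo - hi*c.  Shown with small numbers;
     * sc_reduce applies the same idea with k = 252. */
    int main(void)
    {
        const unsigned k = 16;
        const int64_t  c = 5;
        const int64_t  n = ((int64_t)1 << k) + c;

        int64_t x      = 0x12345678;                 /* wider than k bits */
        int64_t hi     = x >> k;
        int64_t lo     = x & (((int64_t)1 << k) - 1);
        int64_t folded = lo - hi * c;                /* one fold step     */

        printf("%lld %lld\n",                        /* both values match */
               (long long)(x % n),
               (long long)(((folded % n) + n) % n));
        return 0;
    }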
SUB sp, sp, #0x34 + # Load bits 252-511 + ADD r0, r0, #0x1c + LDM r0, {r1, r2, r3, r4, r5, r6, r7, r8, r9} + LSR lr, r9, #24 + LSL r9, r9, #4 + ORR r9, r9, r8, LSR #28 + LSL r8, r8, #4 + ORR r8, r8, r7, LSR #28 + LSL r7, r7, #4 + ORR r7, r7, r6, LSR #28 + LSL r6, r6, #4 + ORR r6, r6, r5, LSR #28 + LSL r5, r5, #4 + ORR r5, r5, r4, LSR #28 + LSL r4, r4, #4 + ORR r4, r4, r3, LSR #28 + LSL r3, r3, #4 + ORR r3, r3, r2, LSR #28 + LSL r2, r2, #4 + ORR r2, r2, r1, LSR #28 + BFC r9, #28, #4 + SUB r0, r0, #0x1c + # Add order times bits 504..511 + MOV r10, #0x2c13 + MOVT r10, #0xa30a + MOV r11, #0x9ce5 + MOVT r11, #0xa7ed + MOV r1, #0x0 + UMLAL r2, r1, r10, lr + UMAAL r3, r1, r11, lr + MOV r10, #0x6329 + MOVT r10, #0x5d08 + MOV r11, #0x621 + MOVT r11, #0xeb21 + UMAAL r4, r1, r10, lr + UMAAL r5, r1, r11, lr + ADDS r6, r6, r1 + ADCS r7, r7, #0x0 + ADCS r8, r8, #0x0 + ADC r9, r9, #0x0 + SUBS r6, r6, lr + SBCS r7, r7, #0x0 + SBCS r8, r8, #0x0 + SBC r9, r9, #0x0 + # Sub product of top 8 words and order + MOV r1, #0x2c13 + MOVT r1, #0xa30a + MOV lr, #0x0 + LDM r0!, {r10, r11, r12} + UMLAL r10, lr, r2, r1 + UMAAL r11, lr, r3, r1 + UMAAL r12, lr, r4, r1 + STM sp!, {r10, r11, r12} + LDM r0!, {r10, r11, r12} + UMAAL r10, lr, r5, r1 + UMAAL r11, lr, r6, r1 + UMAAL r12, lr, r7, r1 + STM sp!, {r10, r11, r12} + LDM r0!, {r10, r11} + UMAAL r10, lr, r8, r1 + BFC r11, #28, #4 + UMAAL r11, lr, r9, r1 + STM sp!, {r10, r11, lr} + SUB r0, r0, #0x10 + SUB sp, sp, #0x20 + MOV r1, #0x9ce5 + MOVT r1, #0xa7ed + MOV lr, #0x0 + LDM sp, {r10, r11, r12} + UMLAL r10, lr, r2, r1 + UMAAL r11, lr, r3, r1 + UMAAL r12, lr, r4, r1 + STM sp!, {r10, r11, r12} + LDM sp, {r10, r11, r12} + UMAAL r10, lr, r5, r1 + UMAAL r11, lr, r6, r1 + UMAAL r12, lr, r7, r1 + STM sp!, {r10, r11, r12} + LDM sp, {r10, r11} + UMAAL r10, lr, r8, r1 + UMAAL r11, lr, r9, r1 + STM sp!, {r10, r11, lr} + SUB sp, sp, #0x20 + MOV r1, #0x6329 + MOVT r1, #0x5d08 + MOV lr, #0x0 + LDM sp, {r10, r11, r12} + UMLAL r10, lr, r2, r1 + UMAAL r11, lr, r3, r1 + UMAAL r12, lr, r4, r1 + STM sp!, {r10, r11, r12} + LDM sp, {r10, r11, r12} + UMAAL r10, lr, r5, r1 + UMAAL r11, lr, r6, r1 + UMAAL r12, lr, r7, r1 + STM sp!, {r10, r11, r12} + LDM sp, {r10, r11} + UMAAL r10, lr, r8, r1 + UMAAL r11, lr, r9, r1 + STM sp!, {r10, r11, lr} + SUB sp, sp, #0x20 + MOV r1, #0x621 + MOVT r1, #0xeb21 + MOV lr, #0x0 + LDM sp, {r10, r11, r12} + UMLAL r10, lr, r2, r1 + UMAAL r11, lr, r3, r1 + UMAAL r12, lr, r4, r1 + STM sp!, {r10, r11, r12} + LDM sp, {r10, r11, r12} + UMAAL r10, lr, r5, r1 + UMAAL r11, lr, r6, r1 + UMAAL r12, lr, r7, r1 + STM sp!, {r10, r11, r12} + LDM sp, {r10, r11} + UMAAL r10, lr, r8, r1 + UMAAL r11, lr, r9, r1 + STM sp!, {r10, r11, lr} + SUB sp, sp, #0x20 + # Subtract at 4 * 32 + LDM sp, {r10, r11, r12} + SUBS r10, r10, r2 + SBCS r11, r11, r3 + SBCS r12, r12, r4 + STM sp!, {r10, r11, r12} + LDM sp, {r10, r11, r12} + SBCS r10, r10, r5 + SBCS r11, r11, r6 + SBCS r12, r12, r7 + STM sp!, {r10, r11, r12} + LDM sp, {r10, r11} + SBCS r10, r10, r8 + SBC r11, r11, r9 + STM sp!, {r10, r11} + SUB sp, sp, #0x24 + ASR lr, r11, #25 + # Conditionally subtract order starting at bit 125 + MOV r1, #0xa0000000 + MOV r2, #0xba7d + MOVT r2, #0x4b9e + MOV r3, #0x4c63 + MOVT r3, #0xcb02 + MOV r4, #0xf39a + MOVT r4, #0xd45e + MOV r5, #0xdf3b + MOVT r5, #0x29b + MOV r9, #0x2000000 + AND r1, r1, lr + AND r2, r2, lr + AND r3, r3, lr + AND r4, r4, lr + AND r5, r5, lr + AND r9, r9, lr + LDM sp, {r10, r11, r12} + ADDS r10, r10, r1 + ADCS r11, r11, r2 + ADCS r12, r12, r3 + STM sp!, {r10, r11, r12} + LDM sp, {r10, 
r11, r12} + ADCS r10, r10, r4 + ADCS r11, r11, r5 + ADCS r12, r12, #0x0 + STM sp!, {r10, r11, r12} + LDM sp, {r10, r11, r12} + ADCS r10, r10, #0x0 + ADCS r11, r11, #0x0 + ADCS r12, r12, r9 + STM sp!, {r10, r11, r12} + SUB sp, sp, #0x30 + SUB r0, r0, #0x10 + # Load bits 252-376 + ADD sp, sp, #0x1c + LDM sp, {r1, r2, r3, r4, r5} + LSL r5, r5, #4 + ORR r5, r5, r4, LSR #28 + LSL r4, r4, #4 + ORR r4, r4, r3, LSR #28 + LSL r3, r3, #4 + ORR r3, r3, r2, LSR #28 + LSL r2, r2, #4 + ORR r2, r2, r1, LSR #28 + BFC r5, #29, #3 + SUB sp, sp, #0x1c + # Sub product of top 8 words and order + # * -5cf5d3ed + MOV r1, #0x2c13 + MOVT r1, #0xa30a + MOV lr, #0x0 + LDM sp, {r6, r7, r8, r9} + UMLAL r6, lr, r2, r1 + UMAAL r7, lr, r3, r1 + UMAAL r8, lr, r4, r1 + UMAAL r9, lr, r5, r1 + STM sp, {r6, r7, r8, r9} + ADD sp, sp, #0x4 + # * -5812631b + MOV r1, #0x9ce5 + MOVT r1, #0xa7ed + MOV r10, #0x0 + LDM sp, {r6, r7, r8, r9} + UMLAL r6, r10, r2, r1 + UMAAL r7, r10, r3, r1 + UMAAL r8, r10, r4, r1 + UMAAL r9, r10, r5, r1 + STM sp, {r6, r7, r8, r9} + ADD sp, sp, #0x4 + # * -a2f79cd7 + MOV r1, #0x6329 + MOVT r1, #0x5d08 + MOV r11, #0x0 + LDM sp, {r6, r7, r8, r9} + UMLAL r6, r11, r2, r1 + UMAAL r7, r11, r3, r1 + UMAAL r8, r11, r4, r1 + UMAAL r9, r11, r5, r1 + STM sp, {r6, r7, r8, r9} + ADD sp, sp, #0x4 + # * -14def9df + MOV r1, #0x621 + MOVT r1, #0xeb21 + MOV r12, #0x0 + LDM sp, {r6, r7, r8, r9} + UMLAL r6, r12, r2, r1 + UMAAL r7, r12, r3, r1 + UMAAL r8, r12, r4, r1 + UMAAL r9, r12, r5, r1 + STM sp, {r6, r7, r8, r9} + ADD sp, sp, #0x4 + # Add overflows at 4 * 32 + LDM sp, {r6, r7, r8, r9} + BFC r9, #28, #4 + ADDS r6, r6, lr + ADCS r7, r7, r10 + ADCS r8, r8, r11 + ADC r9, r9, r12 + # Subtract top at 4 * 32 + SUBS r6, r6, r2 + SBCS r7, r7, r3 + SBCS r8, r8, r4 + SBCS r9, r9, r5 + SBC r1, r1, r1 + SUB sp, sp, #0x10 + LDM sp, {r2, r3, r4, r5} + MOV r10, #0xd3ed + MOVT r10, #0x5cf5 + MOV r11, #0x631a + MOVT r11, #0x5812 + MOV r12, #0x9cd6 + MOVT r12, #0xa2f7 + MOV lr, #0xf9de + MOVT lr, #0x14de + AND r10, r10, r1 + AND r11, r11, r1 + AND r12, r12, r1 + AND lr, lr, r1 + ADDS r2, r2, r10 + ADCS r3, r3, r11 + ADCS r4, r4, r12 + ADCS r5, r5, lr + ADCS r6, r6, #0x0 + ADCS r7, r7, #0x0 + AND r1, r1, #0x10000000 + ADCS r8, r8, #0x0 + ADC r9, r9, r1 + BFC r9, #28, #4 + # Store result + STM r0, {r2, r3, r4, r5, r6, r7, r8, r9} + ADD sp, sp, #0x34 + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 482 + .size sc_reduce,.-sc_reduce + .text + .align 4 + .globl sc_muladd + .type sc_muladd, %function +sc_muladd: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0x50 + ADD lr, sp, #0x44 + STM lr, {r0, r1, r3} + MOV lr, r2 + LDM r1, {r0, r1, r2, r3} + LDM lr!, {r4, r5, r6} + UMULL r10, r11, r0, r4 + UMULL r12, r7, r1, r4 + UMAAL r11, r12, r0, r5 + UMULL r8, r9, r2, r4 + UMAAL r12, r8, r1, r5 + UMAAL r12, r7, r0, r6 + UMAAL r8, r9, r3, r4 + STM sp, {r10, r11, r12} + UMAAL r7, r8, r2, r5 + LDM lr!, {r4} + UMULL r10, r11, r1, r6 + UMAAL r8, r9, r2, r6 + UMAAL r7, r10, r0, r4 + UMAAL r8, r11, r3, r5 + STR r7, [sp, #12] + UMAAL r8, r10, r1, r4 + UMAAL r9, r11, r3, r6 + UMAAL r9, r10, r2, r4 + UMAAL r10, r11, r3, r4 + LDM lr, {r4, r5, r6, r7} + MOV r12, #0x0 + UMLAL r8, r12, r0, r4 + UMAAL r9, r12, r1, r4 + UMAAL r10, r12, r2, r4 + UMAAL r11, r12, r3, r4 + MOV r4, #0x0 + UMLAL r9, r4, r0, r5 + UMAAL r10, r4, r1, r5 + UMAAL r11, r4, r2, r5 + UMAAL r12, r4, r3, r5 + MOV r5, #0x0 + UMLAL r10, r5, r0, r6 + UMAAL r11, r5, r1, r6 + UMAAL r12, r5, r2, r6 + UMAAL r4, r5, r3, r6 + MOV r6, #0x0 + UMLAL r11, r6, r0, r7 + LDR r0, [sp, #72] 
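+ # Descriptive note (not in the generated source): the pointer to the second
+ # multiplicand saved on entry is reloaded here; the instructions below step
+ # it to its upper four words and rewind the other operand so the remaining
+ # columns of the 8x8-word product can be accumulated with UMAAL.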
+ UMAAL r12, r6, r1, r7 + ADD r0, r0, #0x10 + UMAAL r4, r6, r2, r7 + SUB lr, lr, #0x10 + UMAAL r5, r6, r3, r7 + LDM r0, {r0, r1, r2, r3} + STR r6, [sp, #64] + LDM lr!, {r6} + MOV r7, #0x0 + UMLAL r8, r7, r0, r6 + UMAAL r9, r7, r1, r6 + STR r8, [sp, #16] + UMAAL r10, r7, r2, r6 + UMAAL r11, r7, r3, r6 + LDM lr!, {r6} + MOV r8, #0x0 + UMLAL r9, r8, r0, r6 + UMAAL r10, r8, r1, r6 + STR r9, [sp, #20] + UMAAL r11, r8, r2, r6 + UMAAL r12, r8, r3, r6 + LDM lr!, {r6} + MOV r9, #0x0 + UMLAL r10, r9, r0, r6 + UMAAL r11, r9, r1, r6 + STR r10, [sp, #24] + UMAAL r12, r9, r2, r6 + UMAAL r4, r9, r3, r6 + LDM lr!, {r6} + MOV r10, #0x0 + UMLAL r11, r10, r0, r6 + UMAAL r12, r10, r1, r6 + STR r11, [sp, #28] + UMAAL r4, r10, r2, r6 + UMAAL r5, r10, r3, r6 + LDM lr!, {r11} + UMAAL r12, r7, r0, r11 + UMAAL r4, r7, r1, r11 + LDR r6, [sp, #64] + UMAAL r5, r7, r2, r11 + UMAAL r6, r7, r3, r11 + LDM lr!, {r11} + UMAAL r4, r8, r0, r11 + UMAAL r5, r8, r1, r11 + UMAAL r6, r8, r2, r11 + UMAAL r7, r8, r3, r11 + LDM lr, {r11, lr} + UMAAL r5, r9, r0, r11 + UMAAL r6, r10, r0, lr + UMAAL r6, r9, r1, r11 + UMAAL r7, r10, r1, lr + UMAAL r7, r9, r2, r11 + UMAAL r8, r10, r2, lr + UMAAL r8, r9, r3, r11 + UMAAL r9, r10, r3, lr + MOV r3, r12 + ADD lr, sp, #0x20 + STM lr, {r3, r4, r5, r6, r7, r8, r9, r10} + LDR r0, [sp, #68] + # Add c to a * b + LDR lr, [sp, #76] + LDM sp!, {r2, r3, r4, r5, r6, r7, r8, r9} + LDM lr!, {r1, r10, r11, r12} + ADDS r2, r2, r1 + ADCS r3, r3, r10 + ADCS r4, r4, r11 + ADCS r5, r5, r12 + LDM lr!, {r1, r10, r11, r12} + ADCS r6, r6, r1 + ADCS r7, r7, r10 + ADCS r8, r8, r11 + ADCS r9, r9, r12 + MOV r1, r9 + STM r0, {r2, r3, r4, r5, r6, r7, r8, r9} + LDM sp, {r2, r3, r4, r5, r6, r7, r8, r9} + ADCS r2, r2, #0x0 + ADCS r3, r3, #0x0 + ADCS r4, r4, #0x0 + ADCS r5, r5, #0x0 + ADCS r6, r6, #0x0 + ADCS r7, r7, #0x0 + ADCS r8, r8, #0x0 + ADC r9, r9, #0x0 + SUB sp, sp, #0x20 + # Get 252..503 and 504..507 + LSR lr, r9, #24 + BFC r9, #24, #8 + LSL r9, r9, #4 + ORR r9, r9, r8, LSR #28 + LSL r8, r8, #4 + ORR r8, r8, r7, LSR #28 + LSL r7, r7, #4 + ORR r7, r7, r6, LSR #28 + LSL r6, r6, #4 + ORR r6, r6, r5, LSR #28 + LSL r5, r5, #4 + ORR r5, r5, r4, LSR #28 + LSL r4, r4, #4 + ORR r4, r4, r3, LSR #28 + LSL r3, r3, #4 + ORR r3, r3, r2, LSR #28 + LSL r2, r2, #4 + ORR r2, r2, r1, LSR #28 + # Add order times bits 504..507 + MOV r10, #0x2c13 + MOVT r10, #0xa30a + MOV r11, #0x9ce5 + MOVT r11, #0xa7ed + MOV r1, #0x0 + UMLAL r2, r1, r10, lr + UMAAL r3, r1, r11, lr + MOV r10, #0x6329 + MOVT r10, #0x5d08 + MOV r11, #0x621 + MOVT r11, #0xeb21 + UMAAL r4, r1, r10, lr + UMAAL r5, r1, r11, lr + ADDS r6, r6, r1 + ADCS r7, r7, #0x0 + ADCS r8, r8, #0x0 + ADC r9, r9, #0x0 + SUBS r6, r6, lr + SBCS r7, r7, #0x0 + SBCS r8, r8, #0x0 + SBC r9, r9, #0x0 + # Sub product of top 8 words and order + MOV r1, #0x2c13 + MOVT r1, #0xa30a + MOV lr, #0x0 + LDM r0!, {r10, r11, r12} + UMLAL r10, lr, r2, r1 + UMAAL r11, lr, r3, r1 + UMAAL r12, lr, r4, r1 + STM sp!, {r10, r11, r12} + LDM r0!, {r10, r11, r12} + UMAAL r10, lr, r5, r1 + UMAAL r11, lr, r6, r1 + UMAAL r12, lr, r7, r1 + STM sp!, {r10, r11, r12} + LDM r0!, {r10, r11} + UMAAL r10, lr, r8, r1 + BFC r11, #28, #4 + UMAAL r11, lr, r9, r1 + STM sp!, {r10, r11, lr} + SUB r0, r0, #0x10 + SUB sp, sp, #0x20 + MOV r1, #0x9ce5 + MOVT r1, #0xa7ed + MOV lr, #0x0 + LDM sp, {r10, r11, r12} + UMLAL r10, lr, r2, r1 + UMAAL r11, lr, r3, r1 + UMAAL r12, lr, r4, r1 + STM sp!, {r10, r11, r12} + LDM sp, {r10, r11, r12} + UMAAL r10, lr, r5, r1 + UMAAL r11, lr, r6, r1 + UMAAL r12, lr, r7, r1 + STM sp!, {r10, r11, r12} + LDM sp, 
{r10, r11} + UMAAL r10, lr, r8, r1 + UMAAL r11, lr, r9, r1 + STM sp!, {r10, r11, lr} + SUB sp, sp, #0x20 + MOV r1, #0x6329 + MOVT r1, #0x5d08 + MOV lr, #0x0 + LDM sp, {r10, r11, r12} + UMLAL r10, lr, r2, r1 + UMAAL r11, lr, r3, r1 + UMAAL r12, lr, r4, r1 + STM sp!, {r10, r11, r12} + LDM sp, {r10, r11, r12} + UMAAL r10, lr, r5, r1 + UMAAL r11, lr, r6, r1 + UMAAL r12, lr, r7, r1 + STM sp!, {r10, r11, r12} + LDM sp, {r10, r11} + UMAAL r10, lr, r8, r1 + UMAAL r11, lr, r9, r1 + STM sp!, {r10, r11, lr} + SUB sp, sp, #0x20 + MOV r1, #0x621 + MOVT r1, #0xeb21 + MOV lr, #0x0 + LDM sp, {r10, r11, r12} + UMLAL r10, lr, r2, r1 + UMAAL r11, lr, r3, r1 + UMAAL r12, lr, r4, r1 + STM sp!, {r10, r11, r12} + LDM sp, {r10, r11, r12} + UMAAL r10, lr, r5, r1 + UMAAL r11, lr, r6, r1 + UMAAL r12, lr, r7, r1 + STM sp!, {r10, r11, r12} + LDM sp, {r10, r11} + UMAAL r10, lr, r8, r1 + UMAAL r11, lr, r9, r1 + STM sp!, {r10, r11, lr} + SUB sp, sp, #0x20 + # Subtract at 4 * 32 + LDM sp, {r10, r11, r12} + SUBS r10, r10, r2 + SBCS r11, r11, r3 + SBCS r12, r12, r4 + STM sp!, {r10, r11, r12} + LDM sp, {r10, r11, r12} + SBCS r10, r10, r5 + SBCS r11, r11, r6 + SBCS r12, r12, r7 + STM sp!, {r10, r11, r12} + LDM sp, {r10, r11} + SBCS r10, r10, r8 + SBC r11, r11, r9 + STM sp!, {r10, r11} + SUB sp, sp, #0x24 + ASR lr, r11, #25 + # Conditionally subtract order starting at bit 125 + MOV r1, #0xa0000000 + MOV r2, #0xba7d + MOVT r2, #0x4b9e + MOV r3, #0x4c63 + MOVT r3, #0xcb02 + MOV r4, #0xf39a + MOVT r4, #0xd45e + MOV r5, #0xdf3b + MOVT r5, #0x29b + MOV r9, #0x2000000 + AND r1, r1, lr + AND r2, r2, lr + AND r3, r3, lr + AND r4, r4, lr + AND r5, r5, lr + AND r9, r9, lr + LDM sp, {r10, r11, r12} + ADDS r10, r10, r1 + ADCS r11, r11, r2 + ADCS r12, r12, r3 + STM sp!, {r10, r11, r12} + LDM sp, {r10, r11, r12} + ADCS r10, r10, r4 + ADCS r11, r11, r5 + ADCS r12, r12, #0x0 + STM sp!, {r10, r11, r12} + LDM sp, {r10, r11, r12} + ADCS r10, r10, #0x0 + ADCS r11, r11, #0x0 + ADCS r12, r12, r9 + STM sp!, {r10, r11, r12} + SUB sp, sp, #0x30 + SUB r0, r0, #0x10 + # Load bits 252-376 + ADD sp, sp, #0x1c + LDM sp, {r1, r2, r3, r4, r5} + LSL r5, r5, #4 + ORR r5, r5, r4, LSR #28 + LSL r4, r4, #4 + ORR r4, r4, r3, LSR #28 + LSL r3, r3, #4 + ORR r3, r3, r2, LSR #28 + LSL r2, r2, #4 + ORR r2, r2, r1, LSR #28 + BFC r5, #29, #3 + SUB sp, sp, #0x1c + # Sub product of top 8 words and order + # * -5cf5d3ed + MOV r1, #0x2c13 + MOVT r1, #0xa30a + MOV lr, #0x0 + LDM sp, {r6, r7, r8, r9} + UMLAL r6, lr, r2, r1 + UMAAL r7, lr, r3, r1 + UMAAL r8, lr, r4, r1 + UMAAL r9, lr, r5, r1 + STM sp, {r6, r7, r8, r9} + ADD sp, sp, #0x4 + # * -5812631b + MOV r1, #0x9ce5 + MOVT r1, #0xa7ed + MOV r10, #0x0 + LDM sp, {r6, r7, r8, r9} + UMLAL r6, r10, r2, r1 + UMAAL r7, r10, r3, r1 + UMAAL r8, r10, r4, r1 + UMAAL r9, r10, r5, r1 + STM sp, {r6, r7, r8, r9} + ADD sp, sp, #0x4 + # * -a2f79cd7 + MOV r1, #0x6329 + MOVT r1, #0x5d08 + MOV r11, #0x0 + LDM sp, {r6, r7, r8, r9} + UMLAL r6, r11, r2, r1 + UMAAL r7, r11, r3, r1 + UMAAL r8, r11, r4, r1 + UMAAL r9, r11, r5, r1 + STM sp, {r6, r7, r8, r9} + ADD sp, sp, #0x4 + # * -14def9df + MOV r1, #0x621 + MOVT r1, #0xeb21 + MOV r12, #0x0 + LDM sp, {r6, r7, r8, r9} + UMLAL r6, r12, r2, r1 + UMAAL r7, r12, r3, r1 + UMAAL r8, r12, r4, r1 + UMAAL r9, r12, r5, r1 + STM sp, {r6, r7, r8, r9} + ADD sp, sp, #0x4 + # Add overflows at 4 * 32 + LDM sp, {r6, r7, r8, r9} + BFC r9, #28, #4 + ADDS r6, r6, lr + ADCS r7, r7, r10 + ADCS r8, r8, r11 + ADC r9, r9, r12 + # Subtract top at 4 * 32 + SUBS r6, r6, r2 + SBCS r7, r7, r3 + SBCS r8, r8, r4 + SBCS r9, r9, r5 
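+ # Descriptive note (not in the generated source): the SBC below turns the
+ # borrow from the subtraction above into an all-ones mask, so the order
+ # words can be added back in constant time when the result went negative.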
+ SBC r1, r1, r1 + SUB sp, sp, #0x10 + LDM sp, {r2, r3, r4, r5} + MOV r10, #0xd3ed + MOVT r10, #0x5cf5 + MOV r11, #0x631a + MOVT r11, #0x5812 + MOV r12, #0x9cd6 + MOVT r12, #0xa2f7 + MOV lr, #0xf9de + MOVT lr, #0x14de + AND r10, r10, r1 + AND r11, r11, r1 + AND r12, r12, r1 + AND lr, lr, r1 + ADDS r2, r2, r10 + ADCS r3, r3, r11 + ADCS r4, r4, r12 + ADCS r5, r5, lr + ADCS r6, r6, #0x0 + ADCS r7, r7, #0x0 + AND r1, r1, #0x10000000 + ADCS r8, r8, #0x0 + ADC r9, r9, r1 + BFC r9, #28, #4 + # Store result + STM r0, {r2, r3, r4, r5, r6, r7, r8, r9} + ADD sp, sp, #0x50 + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 728 + .size sc_muladd,.-sc_muladd +#endif /* HAVE_ED25519 */ + +#endif /* !CURVE25519_SMALL || !ED25519_SMALL */ +#endif /* HAVE_CURVE25519 || HAVE_ED25519 */ +#endif /* !__aarch64__ && __thumb__ */ +#endif /* WOLFSSL_ARMASM */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif +#endif /* !WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-curve25519_c.c b/wolfcrypt/src/port/arm/thumb2-curve25519_c.c new file mode 100644 index 0000000000..e28885cb6a --- /dev/null +++ b/wolfcrypt/src/port/arm/thumb2-curve25519_c.c @@ -0,0 +1,4111 @@ +/* thumb2-curve25519 + * + * Copyright (C) 2006-2023 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./x25519/x25519.rb thumb2 ../wolfssl/wolfcrypt/src/port/arm/thumb2-curve25519.c + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__thumb__) +#include +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include +#ifdef WOLFSSL_ARMASM_INLINE +/* Based on work by: Emil Lenngren + * https://github.com/pornin/X25519-Cortex-M4 + */ + +#include +#define CURVED25519_ASM +#include + +#if defined(HAVE_CURVE25519) || defined(HAVE_ED25519) +#if !defined(CURVE25519_SMALL) || !defined(ED25519_SMALL) + +void fe_init(void) +{ + __asm__ __volatile__ ( + "\n\t" + : + : + : "memory" + ); +} + +void fe_add_sub_op(void); +void fe_add_sub_op(void) +{ + __asm__ __volatile__ ( + /* Add-Sub */ + "LDRD r4, r5, [r2]\n\t" + "LDRD r6, r7, [r3]\n\t" + /* Add */ + "ADDS r8, r4, r6\n\t" + "MOV r12, #0x0\n\t" + "ADCS r9, r5, r7\n\t" + "ADC r12, r12, #0x0\n\t" + "STRD r8, r9, [r0]\n\t" + /* Sub */ + "SUBS r10, r4, r6\n\t" + "SBCS r11, r5, r7\n\t" + "STRD r10, r11, [r1]\n\t" + "LDRD r4, r5, [r2, #8]\n\t" + "LDRD r6, r7, [r3, #8]\n\t" + /* Sub */ + "SBCS r10, r4, r6\n\t" + "MOV lr, #0x0\n\t" + "SBCS r11, r5, r7\n\t" + "ADC lr, lr, #0x0\n\t" + "STRD r10, r11, [r1, #8]\n\t" + /* Add */ + "SUBS r12, r12, #0x1\n\t" + "ADCS r8, r4, r6\n\t" + "ADCS r9, r5, r7\n\t" + "STRD r8, r9, [r0, #8]\n\t" + "LDRD r4, r5, [r2, #16]\n\t" + "LDRD r6, r7, [r3, #16]\n\t" + /* Add */ + "ADCS r8, r4, r6\n\t" + "MOV r12, #0x0\n\t" + "ADCS r9, r5, r7\n\t" + "ADC r12, r12, #0x0\n\t" + "STRD r8, r9, [r0, #16]\n\t" + /* Sub */ + "SUBS lr, lr, #0x1\n\t" + "SBCS r10, r4, r6\n\t" + "SBCS r11, r5, r7\n\t" + "STRD r10, r11, [r1, #16]\n\t" + "LDRD r4, r5, [r2, #24]\n\t" + "LDRD r6, r7, [r3, #24]\n\t" + /* Sub */ + "SBCS r10, r4, r6\n\t" + "SBC r11, r5, r7\n\t" + /* Add */ + "SUBS r12, r12, #0x1\n\t" + "ADCS r8, r4, r6\n\t" + "MOV r12, #0x0\n\t" + "ADCS r9, r5, r7\n\t" + "ADC r12, r12, #0x0\n\t" + /* Multiply -modulus by overflow */ + "LSL r3, r12, #1\n\t" + "MOV r12, #0x13\n\t" + "ORR r3, r3, r9, LSR #31\n\t" + "MUL r12, r3, r12\n\t" + /* Add -x*modulus (if overflow) */ + "LDRD r4, r5, [r0]\n\t" + "LDRD r6, r7, [r0, #8]\n\t" + "ADDS r4, r4, r12\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "STRD r4, r5, [r0]\n\t" + "STRD r6, r7, [r0, #8]\n\t" + "LDRD r4, r5, [r0, #16]\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADCS r5, r5, #0x0\n\t" + "STRD r4, r5, [r0, #16]\n\t" + "BFC r9, #31, #1\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADC r9, r9, #0x0\n\t" + "STRD r8, r9, [r0, #24]\n\t" + /* Add -modulus on underflow */ + "MOV lr, #0x13\n\t" + "AND lr, lr, r11, ASR #31\n\t" + "LDM r1, {r4, r5, r6, r7, r8, r9}\n\t" + "SUBS r4, r4, lr\n\t" + "SBCS r5, r5, #0x0\n\t" + "SBCS r6, r6, #0x0\n\t" + "SBCS r7, r7, #0x0\n\t" + "SBCS r8, r8, #0x0\n\t" + "SBCS r9, r9, #0x0\n\t" + "BFC r11, #31, #1\n\t" + "SBCS r10, r10, #0x0\n\t" + "SBC r11, r11, #0x0\n\t" + "STM r1, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + /* Done Add-Sub */ + : + : + : "memory", "lr" + ); +} + +void fe_sub_op(void); +void fe_sub_op(void) +{ + __asm__ __volatile__ ( + /* Sub */ + "LDM r2!, {r6, r7, r8, r9, r10, r11, r12, lr}\n\t" + "LDM r1!, {r2, r3, r4, r5}\n\t" + "SUBS r6, r2, r6\n\t" + "SBCS r7, r3, 
r7\n\t" + "SBCS r8, r4, r8\n\t" + "SBCS r9, r5, r9\n\t" + "LDM r1!, {r2, r3, r4, r5}\n\t" + "SBCS r10, r2, r10\n\t" + "SBCS r11, r3, r11\n\t" + "SBCS r12, r4, r12\n\t" + "SBC lr, r5, lr\n\t" + "MOV r2, #0x13\n\t" + "AND r2, r2, lr, ASR #31\n\t" + "SUBS r6, r6, r2\n\t" + "SBCS r7, r7, #0x0\n\t" + "SBCS r8, r8, #0x0\n\t" + "SBCS r9, r9, #0x0\n\t" + "SBCS r10, r10, #0x0\n\t" + "SBCS r11, r11, #0x0\n\t" + "BFC lr, #31, #1\n\t" + "SBCS r12, r12, #0x0\n\t" + "SBC lr, lr, #0x0\n\t" + "STM r0, {r6, r7, r8, r9, r10, r11, r12, lr}\n\t" + /* Done Sub */ + : + : + : "memory", "lr" + ); +} + +void fe_sub(fe r, const fe a, const fe b) +{ + __asm__ __volatile__ ( + "BL fe_sub_op\n\t" + : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +void fe_add_op(void); +void fe_add_op(void) +{ + __asm__ __volatile__ ( + /* Add */ + "LDM r2!, {r6, r7, r8, r9, r10, r11, r12, lr}\n\t" + "LDM r1!, {r2, r3, r4, r5}\n\t" + "ADDS r6, r2, r6\n\t" + "ADCS r7, r3, r7\n\t" + "ADCS r8, r4, r8\n\t" + "ADCS r9, r5, r9\n\t" + "LDM r1!, {r2, r3, r4, r5}\n\t" + "ADCS r10, r2, r10\n\t" + "ADCS r11, r3, r11\n\t" + "ADCS r12, r4, r12\n\t" + "ADC lr, r5, lr\n\t" + "MOV r2, #0x13\n\t" + "AND r2, r2, lr, ASR #31\n\t" + "ADDS r6, r6, r2\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADCS r9, r9, #0x0\n\t" + "ADCS r10, r10, #0x0\n\t" + "ADCS r11, r11, #0x0\n\t" + "BFC lr, #31, #1\n\t" + "ADCS r12, r12, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "STM r0, {r6, r7, r8, r9, r10, r11, r12, lr}\n\t" + /* Done Add */ + : + : + : "memory", "lr" + ); +} + +void fe_add(fe r, const fe a, const fe b) +{ + __asm__ __volatile__ ( + "BL fe_add_op\n\t" + : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +#ifdef HAVE_ED25519 +void fe_frombytes(fe out, const unsigned char* in) +{ + __asm__ __volatile__ ( + "LDM %[in], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "BFC r9, #31, #1\n\t" + "STM %[out], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + : [out] "+l" (out), [in] "+l" (in) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" + ); +} + +void fe_tobytes(unsigned char* out, const fe n) +{ + __asm__ __volatile__ ( + "LDM %[n], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "ADDS r10, r2, #0x13\n\t" + "ADCS r10, r3, #0x0\n\t" + "ADCS r10, r4, #0x0\n\t" + "ADCS r10, r5, #0x0\n\t" + "ADCS r10, r6, #0x0\n\t" + "ADCS r10, r7, #0x0\n\t" + "ADCS r10, r8, #0x0\n\t" + "ADC r10, r9, #0x0\n\t" + "ASR r10, r10, #31\n\t" + "AND r10, r10, #0x13\n\t" + "ADDS r2, r2, r10\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADC r9, r9, #0x0\n\t" + "BFC r9, #31, #1\n\t" + "STM %[out], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + : [out] "+l" (out), [n] "+l" (n) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); +} + +void fe_1(fe n) +{ + __asm__ __volatile__ ( + /* Set one */ + "MOV r2, #0x1\n\t" + "MOV r3, #0x0\n\t" + "STRD r2, r3, [%[n]]\n\t" + "MOV r2, #0x0\n\t" + "STRD r2, r3, [%[n], #8]\n\t" + "STRD r2, r3, [%[n], #16]\n\t" + "STRD r2, r3, [%[n], #24]\n\t" + : [n] "+l" (n) + : + : "memory", "r2", "r3" + ); +} + +void fe_0(fe n) +{ + __asm__ __volatile__ ( + /* Set zero */ + "MOV r2, #0x0\n\t" + "MOV r3, #0x0\n\t" + "STRD r2, r3, [%[n]]\n\t" + "STRD r2, r3, [%[n], #8]\n\t" + "STRD r2, r3, [%[n], #16]\n\t" + "STRD r2, r3, [%[n], #24]\n\t" + : [n] "+l" (n) + : + : 
"memory", "r2", "r3" + ); +} + +void fe_copy(fe r, const fe a) +{ + __asm__ __volatile__ ( + /* Copy */ + "LDRD r2, r3, [%[a]]\n\t" + "LDRD r4, r5, [%[a], #8]\n\t" + "STRD r2, r3, [%[r]]\n\t" + "STRD r4, r5, [%[r], #8]\n\t" + "LDRD r2, r3, [%[a], #16]\n\t" + "LDRD r4, r5, [%[a], #24]\n\t" + "STRD r2, r3, [%[r], #16]\n\t" + "STRD r4, r5, [%[r], #24]\n\t" + : [r] "+l" (r), [a] "+l" (a) + : + : "memory", "r2", "r3", "r4", "r5" + ); +} + +void fe_neg(fe r, const fe a) +{ + __asm__ __volatile__ ( + "MVN r7, #0x0\n\t" + "MVN r6, #0x12\n\t" + "LDM %[a]!, {r2, r3, r4, r5}\n\t" + "SUBS r2, r6, r2\n\t" + "SBCS r3, r7, r3\n\t" + "SBCS r4, r7, r4\n\t" + "SBCS r5, r7, r5\n\t" + "STM %[r]!, {r2, r3, r4, r5}\n\t" + "MVN r6, #0x80000000\n\t" + "LDM %[a]!, {r2, r3, r4, r5}\n\t" + "SBCS r2, r7, r2\n\t" + "SBCS r3, r7, r3\n\t" + "SBCS r4, r7, r4\n\t" + "SBC r5, r6, r5\n\t" + "STM %[r]!, {r2, r3, r4, r5}\n\t" + : [r] "+l" (r), [a] "+l" (a) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7" + ); +} + +int fe_isnonzero(const fe a) +{ + __asm__ __volatile__ ( + "LDM %[a], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "ADDS r1, r2, #0x13\n\t" + "ADCS r1, r3, #0x0\n\t" + "ADCS r1, r4, #0x0\n\t" + "ADCS r1, r5, #0x0\n\t" + "ADCS r1, r6, #0x0\n\t" + "ADCS r1, r7, #0x0\n\t" + "ADCS r1, r8, #0x0\n\t" + "ADC r1, r9, #0x0\n\t" + "ASR r1, r1, #31\n\t" + "AND r1, r1, #0x13\n\t" + "ADDS r2, r2, r1\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADC r9, r9, #0x0\n\t" + "BFC r9, #31, #1\n\t" + "ORR r2, r2, r3\n\t" + "ORR r4, r4, r5\n\t" + "ORR r6, r6, r7\n\t" + "ORR r8, r8, r9\n\t" + "ORR r4, r4, r6\n\t" + "ORR r2, r2, r8\n\t" + "ORR %[a], r2, r4\n\t" + : [a] "+l" (a) + : + : "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + return (uint32_t)(size_t)a; +} + +int fe_isnegative(const fe a) +{ + __asm__ __volatile__ ( + "LDM %[a]!, {r2, r3, r4, r5}\n\t" + "ADDS r1, r2, #0x13\n\t" + "ADCS r1, r3, #0x0\n\t" + "ADCS r1, r4, #0x0\n\t" + "ADCS r1, r5, #0x0\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "ADCS r1, r2, #0x0\n\t" + "ADCS r1, r3, #0x0\n\t" + "ADCS r1, r4, #0x0\n\t" + "LDR r2, [%[a], #-16]\n\t" + "ADC r1, r5, #0x0\n\t" + "AND %[a], r2, #0x1\n\t" + "LSR r1, r1, #31\n\t" + "EOR %[a], %[a], r1\n\t" + : [a] "+l" (a) + : + : "memory", "r1", "r2", "r3", "r4", "r5" + ); + return (uint32_t)(size_t)a; +} + +#ifndef WC_NO_CACHE_RESISTANT +void fe_cmov_table(fe* r, fe* base, signed char b) +{ + __asm__ __volatile__ ( + "SXTB %[b], %[b]\n\t" + "SBFX r3, %[b], #7, #1\n\t" + "EOR r12, %[b], r3\n\t" + "SUB r12, r12, r3\n\t" + "MOV r4, #0x1\n\t" + "MOV r5, #0x0\n\t" + "MOV r6, #0x1\n\t" + "MOV r7, #0x0\n\t" + "MOV r8, #0x0\n\t" + "MOV r9, #0x0\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #31\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base]]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #32]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #64]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #30\n\t" + "ROR r3, r3, r12\n\t" + "ASR 
r3, r3, #31\n\t" + "LDRD r10, r11, [%[base]]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #32]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #64]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #29\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base]]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #32]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #64]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #28\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base]]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #32]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #64]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #27\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base]]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #32]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #64]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #26\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base]]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #32]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #64]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #25\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base]]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND 
r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #32]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #64]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #24\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base]]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #32]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #64]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "SUB %[base], %[base], #0x2a0\n\t" + "MVN r10, #0x12\n\t" + "MVN r11, #0x0\n\t" + "SUBS r10, r10, r8\n\t" + "SBCS r11, r11, r9\n\t" + "SBC lr, lr, lr\n\t" + "ASR r12, %[b], #31\n\t" + "EOR r3, r4, r6\n\t" + "AND r3, r3, r12\n\t" + "EOR r4, r4, r3\n\t" + "EOR r6, r6, r3\n\t" + "EOR r3, r5, r7\n\t" + "AND r3, r3, r12\n\t" + "EOR r5, r5, r3\n\t" + "EOR r7, r7, r3\n\t" + "EOR r10, r10, r8\n\t" + "AND r10, r10, r12\n\t" + "EOR r8, r8, r10\n\t" + "EOR r11, r11, r9\n\t" + "AND r11, r11, r12\n\t" + "EOR r9, r9, r11\n\t" + "STRD r4, r5, [%[r]]\n\t" + "STRD r6, r7, [%[r], #32]\n\t" + "STRD r8, r9, [%[r], #64]\n\t" + "SBFX r3, %[b], #7, #1\n\t" + "EOR r12, %[b], r3\n\t" + "SUB r12, r12, r3\n\t" + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "MOV r6, #0x0\n\t" + "MOV r7, #0x0\n\t" + "MOV r8, #0x0\n\t" + "MOV r9, #0x0\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #31\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #8]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #40]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #72]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #30\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #8]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #40]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #72]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #29\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #8]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND 
r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #40]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #72]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #28\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #8]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #40]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #72]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #27\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #8]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #40]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #72]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #26\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #8]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #40]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #72]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #25\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #8]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #40]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #72]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #24\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #8]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, 
r11\n\t" + "LDRD r10, r11, [%[base], #40]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #72]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "SUB %[base], %[base], #0x2a0\n\t" + "MVN r10, #0x0\n\t" + "MVN r11, #0x0\n\t" + "RSBS lr, lr, #0x0\n\t" + "SBCS r10, r10, r8\n\t" + "SBCS r11, r11, r9\n\t" + "SBC lr, lr, lr\n\t" + "ASR r12, %[b], #31\n\t" + "EOR r3, r4, r6\n\t" + "AND r3, r3, r12\n\t" + "EOR r4, r4, r3\n\t" + "EOR r6, r6, r3\n\t" + "EOR r3, r5, r7\n\t" + "AND r3, r3, r12\n\t" + "EOR r5, r5, r3\n\t" + "EOR r7, r7, r3\n\t" + "EOR r10, r10, r8\n\t" + "AND r10, r10, r12\n\t" + "EOR r8, r8, r10\n\t" + "EOR r11, r11, r9\n\t" + "AND r11, r11, r12\n\t" + "EOR r9, r9, r11\n\t" + "STRD r4, r5, [%[r], #8]\n\t" + "STRD r6, r7, [%[r], #40]\n\t" + "STRD r8, r9, [%[r], #72]\n\t" + "SBFX r3, %[b], #7, #1\n\t" + "EOR r12, %[b], r3\n\t" + "SUB r12, r12, r3\n\t" + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "MOV r6, #0x0\n\t" + "MOV r7, #0x0\n\t" + "MOV r8, #0x0\n\t" + "MOV r9, #0x0\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #31\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #16]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #48]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #80]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #30\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #16]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #48]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #80]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #29\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #16]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #48]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #80]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #28\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #16]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + 
"EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #48]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #80]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #27\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #16]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #48]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #80]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #26\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #16]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #48]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #80]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #25\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #16]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #48]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #80]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #24\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #16]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #48]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #80]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "SUB %[base], %[base], #0x2a0\n\t" + "MVN r10, #0x0\n\t" + "MVN r11, #0x0\n\t" + "RSBS lr, lr, #0x0\n\t" + "SBCS r10, r10, r8\n\t" + "SBCS r11, r11, r9\n\t" + "SBC lr, lr, lr\n\t" + "ASR r12, %[b], #31\n\t" + "EOR r3, r4, r6\n\t" + "AND r3, r3, r12\n\t" + "EOR r4, r4, r3\n\t" + "EOR r6, r6, r3\n\t" + "EOR r3, r5, r7\n\t" + "AND r3, r3, r12\n\t" + "EOR r5, 
r5, r3\n\t" + "EOR r7, r7, r3\n\t" + "EOR r10, r10, r8\n\t" + "AND r10, r10, r12\n\t" + "EOR r8, r8, r10\n\t" + "EOR r11, r11, r9\n\t" + "AND r11, r11, r12\n\t" + "EOR r9, r9, r11\n\t" + "STRD r4, r5, [%[r], #16]\n\t" + "STRD r6, r7, [%[r], #48]\n\t" + "STRD r8, r9, [%[r], #80]\n\t" + "SBFX r3, %[b], #7, #1\n\t" + "EOR r12, %[b], r3\n\t" + "SUB r12, r12, r3\n\t" + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "MOV r6, #0x0\n\t" + "MOV r7, #0x0\n\t" + "MOV r8, #0x0\n\t" + "MOV r9, #0x0\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #31\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #24]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #56]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #88]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #30\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #24]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #56]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #88]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #29\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #24]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #56]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #88]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #28\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #24]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #56]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #88]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #27\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #24]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + 
"LDRD r10, r11, [%[base], #56]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #88]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #26\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #24]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #56]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #88]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #25\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #24]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #56]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #88]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "ADD %[base], %[base], #0x60\n\t" + "MOV r3, #0x80000000\n\t" + "ROR r3, r3, #24\n\t" + "ROR r3, r3, r12\n\t" + "ASR r3, r3, #31\n\t" + "LDRD r10, r11, [%[base], #24]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r4, r4, r10\n\t" + "EOR r5, r5, r11\n\t" + "LDRD r10, r11, [%[base], #56]\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r6, r6, r10\n\t" + "EOR r7, r7, r11\n\t" + "LDRD r10, r11, [%[base], #88]\n\t" + "EOR r10, r10, r8\n\t" + "EOR r11, r11, r9\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "EOR r8, r8, r10\n\t" + "EOR r9, r9, r11\n\t" + "SUB %[base], %[base], #0x2a0\n\t" + "MVN r10, #0x0\n\t" + "MVN r11, #0x80000000\n\t" + "RSBS lr, lr, #0x0\n\t" + "SBCS r10, r10, r8\n\t" + "SBC r11, r11, r9\n\t" + "ASR r12, %[b], #31\n\t" + "EOR r3, r4, r6\n\t" + "AND r3, r3, r12\n\t" + "EOR r4, r4, r3\n\t" + "EOR r6, r6, r3\n\t" + "EOR r3, r5, r7\n\t" + "AND r3, r3, r12\n\t" + "EOR r5, r5, r3\n\t" + "EOR r7, r7, r3\n\t" + "EOR r10, r10, r8\n\t" + "AND r10, r10, r12\n\t" + "EOR r8, r8, r10\n\t" + "EOR r11, r11, r9\n\t" + "AND r11, r11, r12\n\t" + "EOR r9, r9, r11\n\t" + "STRD r4, r5, [%[r], #24]\n\t" + "STRD r6, r7, [%[r], #56]\n\t" + "STRD r8, r9, [%[r], #88]\n\t" + : [r] "+l" (r), [base] "+l" (base), [b] "+l" (b) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r3", "r10", "r11", "r12", "lr" + ); +} + +#else +void fe_cmov_table(fe* r, fe* base, signed char b) +{ + __asm__ __volatile__ ( + "SXTB %[b], %[b]\n\t" + "SBFX r3, %[b], #7, #1\n\t" + "EOR %[b], %[b], r3\n\t" + "SUB %[b], %[b], r3\n\t" + "CLZ lr, %[b]\n\t" + "LSL lr, lr, #26\n\t" + "ASR lr, lr, #31\n\t" + "MVN lr, lr\n\t" + "ADD %[b], %[b], lr\n\t" + "MOV r12, 
#0x60\n\t" + "MUL %[b], %[b], r12\n\t" + "ADD %[base], %[base], %[b]\n\t" + "LDM %[base]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "AND r4, r4, lr\n\t" + "AND r5, r5, lr\n\t" + "AND r6, r6, lr\n\t" + "AND r7, r7, lr\n\t" + "AND r8, r8, lr\n\t" + "AND r9, r9, lr\n\t" + "AND r10, r10, lr\n\t" + "AND r11, r11, lr\n\t" + "MVN r12, lr\n\t" + "SUB r4, r4, r12\n\t" + "MOV r12, #0x20\n\t" + "AND r12, r12, r3\n\t" + "ADD %[r], %[r], r12\n\t" + "STM %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "SUB %[r], %[r], r12\n\t" + "LDM %[base]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "AND r4, r4, lr\n\t" + "AND r5, r5, lr\n\t" + "AND r6, r6, lr\n\t" + "AND r7, r7, lr\n\t" + "AND r8, r8, lr\n\t" + "AND r9, r9, lr\n\t" + "AND r10, r10, lr\n\t" + "AND r11, r11, lr\n\t" + "MVN r12, lr\n\t" + "SUB r4, r4, r12\n\t" + "MOV r12, #0x20\n\t" + "BIC r12, r12, r3\n\t" + "ADD %[r], %[r], r12\n\t" + "STM %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "SUB %[r], %[r], r12\n\t" + "ADD %[r], %[r], #0x40\n\t" + "LDM %[base]!, {r4, r5, r6, r7}\n\t" + "MVN r12, #0x12\n\t" + "SUBS r8, r12, r4\n\t" + "SBCS r9, r3, r5\n\t" + "SBCS r10, r3, r6\n\t" + "SBCS r11, r3, r7\n\t" + "BIC r4, r4, r3\n\t" + "BIC r5, r5, r3\n\t" + "BIC r6, r6, r3\n\t" + "BIC r7, r7, r3\n\t" + "AND r8, r8, r3\n\t" + "AND r9, r9, r3\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "ORR r4, r4, r8\n\t" + "ORR r5, r5, r9\n\t" + "ORR r6, r6, r10\n\t" + "ORR r7, r7, r11\n\t" + "AND r4, r4, lr\n\t" + "AND r5, r5, lr\n\t" + "AND r6, r6, lr\n\t" + "AND r7, r7, lr\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[base]!, {r4, r5, r6, r7}\n\t" + "MVN r12, #0x80000000\n\t" + "SBCS r8, r3, r4\n\t" + "SBCS r9, r3, r5\n\t" + "SBCS r10, r3, r6\n\t" + "SBC r11, r12, r7\n\t" + "BIC r4, r4, r3\n\t" + "BIC r5, r5, r3\n\t" + "BIC r6, r6, r3\n\t" + "BIC r7, r7, r3\n\t" + "AND r8, r8, r3\n\t" + "AND r9, r9, r3\n\t" + "AND r10, r10, r3\n\t" + "AND r11, r11, r3\n\t" + "ORR r4, r4, r8\n\t" + "ORR r5, r5, r9\n\t" + "ORR r6, r6, r10\n\t" + "ORR r7, r7, r11\n\t" + "AND r4, r4, lr\n\t" + "AND r5, r5, lr\n\t" + "AND r6, r6, lr\n\t" + "AND r7, r7, lr\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "SUB %[base], %[base], %[b]\n\t" + : [r] "+l" (r), [base] "+l" (base), [b] "+l" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +#endif /* WC_NO_CACHE_RESISTANT */ +#endif /* HAVE_ED25519 */ +void fe_mul_op(void); +void fe_mul_op(void) +{ + __asm__ __volatile__ ( + "SUB sp, sp, #0x2c\n\t" + "STRD r0, r1, [sp, #36]\n\t" + "MOV lr, r2\n\t" + "LDM r1, {r0, r1, r2, r3}\n\t" + "LDM lr!, {r4, r5, r6}\n\t" + "UMULL r10, r11, r0, r4\n\t" + "UMULL r12, r7, r1, r4\n\t" + "UMAAL r11, r12, r0, r5\n\t" + "UMULL r8, r9, r2, r4\n\t" + "UMAAL r12, r8, r1, r5\n\t" + "UMAAL r12, r7, r0, r6\n\t" + "UMAAL r8, r9, r3, r4\n\t" + "STM sp, {r10, r11, r12}\n\t" + "UMAAL r7, r8, r2, r5\n\t" + "LDM lr!, {r4}\n\t" + "UMULL r10, r11, r1, r6\n\t" + "UMAAL r8, r9, r2, r6\n\t" + "UMAAL r7, r10, r0, r4\n\t" + "UMAAL r8, r11, r3, r5\n\t" + "STR r7, [sp, #12]\n\t" + "UMAAL r8, r10, r1, r4\n\t" + "UMAAL r9, r11, r3, r6\n\t" + "UMAAL r9, r10, r2, r4\n\t" + "UMAAL r10, r11, r3, r4\n\t" + "LDM lr, {r4, r5, r6, r7}\n\t" + "MOV r12, #0x0\n\t" + "UMLAL r8, r12, r0, r4\n\t" + "UMAAL r9, r12, r1, r4\n\t" + "UMAAL r10, r12, r2, r4\n\t" + "UMAAL r11, r12, r3, r4\n\t" + "MOV r4, #0x0\n\t" + "UMLAL r9, r4, r0, r5\n\t" + "UMAAL r10, r4, r1, r5\n\t" + "UMAAL r11, r4, r2, r5\n\t" + "UMAAL r12, r4, r3, r5\n\t" + "MOV r5, #0x0\n\t" + "UMLAL r10, r5, r0, r6\n\t" + "UMAAL r11, r5, r1, 
r6\n\t" + "UMAAL r12, r5, r2, r6\n\t" + "UMAAL r4, r5, r3, r6\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r11, r6, r0, r7\n\t" + "LDR r0, [sp, #40]\n\t" + "UMAAL r12, r6, r1, r7\n\t" + "ADD r0, r0, #0x10\n\t" + "UMAAL r4, r6, r2, r7\n\t" + "SUB lr, lr, #0x10\n\t" + "UMAAL r5, r6, r3, r7\n\t" + "LDM r0, {r0, r1, r2, r3}\n\t" + "STR r6, [sp, #32]\n\t" + "LDM lr!, {r6}\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r8, r7, r0, r6\n\t" + "UMAAL r9, r7, r1, r6\n\t" + "STR r8, [sp, #16]\n\t" + "UMAAL r10, r7, r2, r6\n\t" + "UMAAL r11, r7, r3, r6\n\t" + "LDM lr!, {r6}\n\t" + "MOV r8, #0x0\n\t" + "UMLAL r9, r8, r0, r6\n\t" + "UMAAL r10, r8, r1, r6\n\t" + "STR r9, [sp, #20]\n\t" + "UMAAL r11, r8, r2, r6\n\t" + "UMAAL r12, r8, r3, r6\n\t" + "LDM lr!, {r6}\n\t" + "MOV r9, #0x0\n\t" + "UMLAL r10, r9, r0, r6\n\t" + "UMAAL r11, r9, r1, r6\n\t" + "STR r10, [sp, #24]\n\t" + "UMAAL r12, r9, r2, r6\n\t" + "UMAAL r4, r9, r3, r6\n\t" + "LDM lr!, {r6}\n\t" + "MOV r10, #0x0\n\t" + "UMLAL r11, r10, r0, r6\n\t" + "UMAAL r12, r10, r1, r6\n\t" + "STR r11, [sp, #28]\n\t" + "UMAAL r4, r10, r2, r6\n\t" + "UMAAL r5, r10, r3, r6\n\t" + "LDM lr!, {r11}\n\t" + "UMAAL r12, r7, r0, r11\n\t" + "UMAAL r4, r7, r1, r11\n\t" + "LDR r6, [sp, #32]\n\t" + "UMAAL r5, r7, r2, r11\n\t" + "UMAAL r6, r7, r3, r11\n\t" + "LDM lr!, {r11}\n\t" + "UMAAL r4, r8, r0, r11\n\t" + "UMAAL r5, r8, r1, r11\n\t" + "UMAAL r6, r8, r2, r11\n\t" + "UMAAL r7, r8, r3, r11\n\t" + "LDM lr, {r11, lr}\n\t" + "UMAAL r5, r9, r0, r11\n\t" + "UMAAL r6, r10, r0, lr\n\t" + "UMAAL r6, r9, r1, r11\n\t" + "UMAAL r7, r10, r1, lr\n\t" + "UMAAL r7, r9, r2, r11\n\t" + "UMAAL r8, r10, r2, lr\n\t" + "UMAAL r8, r9, r3, r11\n\t" + "UMAAL r9, r10, r3, lr\n\t" + /* Reduce */ + "LDR r0, [sp, #28]\n\t" + "MOV lr, #0x25\n\t" + "UMAAL r10, r0, r10, lr\n\t" + "MOV lr, #0x13\n\t" + "LSL r0, r0, #1\n\t" + "ORR r0, r0, r10, LSR #31\n\t" + "MUL r11, r0, lr\n\t" + "POP {r0, r1, r2}\n\t" + "MOV lr, #0x26\n\t" + "UMAAL r0, r11, r12, lr\n\t" + "UMAAL r1, r11, r4, lr\n\t" + "UMAAL r2, r11, r5, lr\n\t" + "POP {r3, r4, r5}\n\t" + "UMAAL r3, r11, r6, lr\n\t" + "UMAAL r4, r11, r7, lr\n\t" + "UMAAL r5, r11, r8, lr\n\t" + "POP {r6}\n\t" + "BFC r10, #31, #1\n\t" + "UMAAL r6, r11, r9, lr\n\t" + "ADD r7, r10, r11\n\t" + "LDR lr, [sp, #8]\n\t" + /* Store */ + "STM lr, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t" + "ADD sp, sp, #0x10\n\t" + : + : + : "memory", "lr" + ); +} + +void fe_mul(fe r, const fe a, const fe b) +{ + __asm__ __volatile__ ( + "BL fe_mul_op\n\t" + : [r] "+l" (r), [a] "+l" (a), [b] "+l" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +void fe_sq_op(void); +void fe_sq_op(void) +{ + __asm__ __volatile__ ( + "SUB sp, sp, #0x20\n\t" + "STR r0, [sp, #28]\n\t" + "LDM r1, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t" + /* Square */ + "UMULL r9, r10, r0, r0\n\t" + "UMULL r11, r12, r0, r1\n\t" + "ADDS r11, r11, r11\n\t" + "MOV lr, #0x0\n\t" + "UMAAL r10, r11, lr, lr\n\t" + "STM sp, {r9, r10}\n\t" + "MOV r8, lr\n\t" + "UMAAL r8, r12, r0, r2\n\t" + "ADCS r8, r8, r8\n\t" + "UMAAL r8, r11, r1, r1\n\t" + "UMULL r9, r10, r0, r3\n\t" + "UMAAL r9, r12, r1, r2\n\t" + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r11, lr, lr\n\t" + "STRD r8, r9, [sp, #8]\n\t" + "MOV r9, lr\n\t" + "UMAAL r9, r10, r0, r4\n\t" + "UMAAL r9, r12, r1, r3\n\t" + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r11, r2, r2\n\t" + "STR r9, [sp, #16]\n\t" + "UMULL r9, r8, r0, r5\n\t" + "UMAAL r9, r12, r1, r4\n\t" + "UMAAL r9, r10, r2, r3\n\t" + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r11, lr, lr\n\t" + "STR r9, [sp, #20]\n\t" + "MOV r9, 
lr\n\t" + "UMAAL r9, r8, r0, r6\n\t" + "UMAAL r9, r12, r1, r5\n\t" + "UMAAL r9, r10, r2, r4\n\t" + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r11, r3, r3\n\t" + "STR r9, [sp, #24]\n\t" + "UMULL r0, r9, r0, r7\n\t" + "UMAAL r0, r8, r1, r6\n\t" + "UMAAL r0, r12, r2, r5\n\t" + "UMAAL r0, r10, r3, r4\n\t" + "ADCS r0, r0, r0\n\t" + "UMAAL r0, r11, lr, lr\n\t" + /* R[7] = r0 */ + "UMAAL r9, r8, r1, r7\n\t" + "UMAAL r9, r10, r2, r6\n\t" + "UMAAL r12, r9, r3, r5\n\t" + "ADCS r12, r12, r12\n\t" + "UMAAL r12, r11, r4, r4\n\t" + /* R[8] = r12 */ + "UMAAL r9, r8, r2, r7\n\t" + "UMAAL r10, r9, r3, r6\n\t" + "MOV r2, lr\n\t" + "UMAAL r10, r2, r4, r5\n\t" + "ADCS r10, r10, r10\n\t" + "UMAAL r11, r10, lr, lr\n\t" + /* R[9] = r11 */ + "UMAAL r2, r8, r3, r7\n\t" + "UMAAL r2, r9, r4, r6\n\t" + "ADCS r3, r2, r2\n\t" + "UMAAL r10, r3, r5, r5\n\t" + /* R[10] = r10 */ + "MOV r1, lr\n\t" + "UMAAL r1, r8, r4, r7\n\t" + "UMAAL r1, r9, r5, r6\n\t" + "ADCS r4, r1, r1\n\t" + "UMAAL r3, r4, lr, lr\n\t" + /* R[11] = r3 */ + "UMAAL r8, r9, r5, r7\n\t" + "ADCS r8, r8, r8\n\t" + "UMAAL r4, r8, r6, r6\n\t" + /* R[12] = r4 */ + "MOV r5, lr\n\t" + "UMAAL r5, r9, r6, r7\n\t" + "ADCS r5, r5, r5\n\t" + "UMAAL r8, r5, lr, lr\n\t" + /* R[13] = r8 */ + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r5, r7, r7\n\t" + "ADCS r7, r5, lr\n\t" + /* R[14] = r9 */ + /* R[15] = r7 */ + /* Reduce */ + "MOV r6, #0x25\n\t" + "UMAAL r7, r0, r7, r6\n\t" + "MOV r6, #0x13\n\t" + "LSL r0, r0, #1\n\t" + "ORR r0, r0, r7, LSR #31\n\t" + "MUL lr, r0, r6\n\t" + "POP {r0, r1}\n\t" + "MOV r6, #0x26\n\t" + "UMAAL r0, lr, r12, r6\n\t" + "UMAAL r1, lr, r11, r6\n\t" + "MOV r12, r3\n\t" + "MOV r11, r4\n\t" + "POP {r2, r3, r4}\n\t" + "UMAAL r2, lr, r10, r6\n\t" + "UMAAL r3, lr, r12, r6\n\t" + "UMAAL r4, lr, r11, r6\n\t" + "MOV r12, r6\n\t" + "POP {r5, r6}\n\t" + "UMAAL r5, lr, r8, r12\n\t" + "BFC r7, #31, #1\n\t" + "UMAAL r6, lr, r9, r12\n\t" + "ADD r7, r7, lr\n\t" + "POP {lr}\n\t" + /* Store */ + "STM lr, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t" + : + : + : "memory", "lr" + ); +} + +void fe_sq(fe r, const fe a) +{ + __asm__ __volatile__ ( + "BL fe_sq_op\n\t" + : [r] "+l" (r), [a] "+l" (a) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +void fe_mul121666(fe r, fe a) +{ + __asm__ __volatile__ ( + /* Multiply by 121666 */ + "LDM %[a], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "MOV r11, #0xdb42\n\t" + "MOVT r11, #0x1\n\t" + "UMULL r2, r12, r2, r11\n\t" + "SUB r10, r11, #0x1\n\t" + "UMAAL r3, r12, r3, r10\n\t" + "UMAAL r4, r12, r4, r10\n\t" + "UMAAL r5, r12, r5, r10\n\t" + "UMAAL r6, r12, r6, r10\n\t" + "UMAAL r7, r12, r7, r10\n\t" + "UMAAL r8, r12, r8, r10\n\t" + "MOV r11, #0x13\n\t" + "UMAAL r9, r12, r9, r10\n\t" + "LSL r12, r12, #1\n\t" + "ORR r12, r12, r9, LSR #31\n\t" + "MUL r12, r12, r11\n\t" + "ADDS r2, r2, r12\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "BFC r9, #31, #1\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADC r9, r9, #0x0\n\t" + "STM %[r], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + : [r] "+l" (r), [a] "+l" (a) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" + ); +} + +#ifndef WC_NO_CACHE_RESISTANT +int curve25519(byte* r, const byte* n, const byte* a) +{ + __asm__ __volatile__ ( + "SUB sp, sp, #0xbc\n\t" + "STR %[r], [sp, #160]\n\t" + "STR %[n], [sp, #164]\n\t" + "STR %[a], [sp, #168]\n\t" + "MOV %[n], #0x0\n\t" + "STR %[n], [sp, #172]\n\t" + /* Set one */ + "MOV r10, #0x1\n\t" + "MOV r11, #0x0\n\t" + 
"STRD r10, r11, [%[r]]\n\t" + "MOV r10, #0x0\n\t" + "STRD r10, r11, [%[r], #8]\n\t" + "STRD r10, r11, [%[r], #16]\n\t" + "STRD r10, r11, [%[r], #24]\n\t" + /* Set zero */ + "MOV r10, #0x0\n\t" + "MOV r11, #0x0\n\t" + "STRD r10, r11, [sp]\n\t" + "STRD r10, r11, [sp, #8]\n\t" + "STRD r10, r11, [sp, #16]\n\t" + "STRD r10, r11, [sp, #24]\n\t" + /* Set one */ + "MOV r10, #0x1\n\t" + "MOV r11, #0x0\n\t" + "STRD r10, r11, [sp, #32]\n\t" + "MOV r10, #0x0\n\t" + "STRD r10, r11, [sp, #40]\n\t" + "STRD r10, r11, [sp, #48]\n\t" + "STRD r10, r11, [sp, #56]\n\t" + "ADD r3, sp, #0x40\n\t" + /* Copy */ + "LDM r2, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "STM r3, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "MOV %[n], #0x1e\n\t" + "STR %[n], [sp, #180]\n\t" + "MOV %[a], #0x1c\n\t" + "STR %[a], [sp, #176]\n\t" + "\n" + "L_curve25519_words_%=:\n\t" + "\n" + "L_curve25519_bits_%=:\n\t" + "LDR %[n], [sp, #164]\n\t" + "LDR %[a], [%[n], r2]\n\t" + "LDR %[n], [sp, #180]\n\t" + "LSR %[a], %[a], %[n]\n\t" + "AND %[a], %[a], #0x1\n\t" + "STR %[a], [sp, #184]\n\t" + "LDR %[n], [sp, #172]\n\t" + "EOR %[n], %[n], %[a]\n\t" + "STR %[n], [sp, #172]\n\t" + "LDR %[r], [sp, #160]\n\t" + /* Conditional Swap */ + "RSB %[n], %[n], #0x0\n\t" + "LDRD r4, r5, [%[r]]\n\t" + "LDRD r6, r7, [sp, #64]\n\t" + "EOR r8, r4, r6\n\t" + "EOR r9, r5, r7\n\t" + "AND r8, r8, %[n]\n\t" + "AND r9, r9, %[n]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "STRD r4, r5, [%[r]]\n\t" + "STRD r6, r7, [sp, #64]\n\t" + "LDRD r4, r5, [%[r], #8]\n\t" + "LDRD r6, r7, [sp, #72]\n\t" + "EOR r8, r4, r6\n\t" + "EOR r9, r5, r7\n\t" + "AND r8, r8, %[n]\n\t" + "AND r9, r9, %[n]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "STRD r4, r5, [%[r], #8]\n\t" + "STRD r6, r7, [sp, #72]\n\t" + "LDRD r4, r5, [%[r], #16]\n\t" + "LDRD r6, r7, [sp, #80]\n\t" + "EOR r8, r4, r6\n\t" + "EOR r9, r5, r7\n\t" + "AND r8, r8, %[n]\n\t" + "AND r9, r9, %[n]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "STRD r4, r5, [%[r], #16]\n\t" + "STRD r6, r7, [sp, #80]\n\t" + "LDRD r4, r5, [%[r], #24]\n\t" + "LDRD r6, r7, [sp, #88]\n\t" + "EOR r8, r4, r6\n\t" + "EOR r9, r5, r7\n\t" + "AND r8, r8, %[n]\n\t" + "AND r9, r9, %[n]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "STRD r4, r5, [%[r], #24]\n\t" + "STRD r6, r7, [sp, #88]\n\t" + "LDR %[n], [sp, #172]\n\t" + /* Conditional Swap */ + "RSB %[n], %[n], #0x0\n\t" + "LDRD r4, r5, [sp]\n\t" + "LDRD r6, r7, [sp, #32]\n\t" + "EOR r8, r4, r6\n\t" + "EOR r9, r5, r7\n\t" + "AND r8, r8, %[n]\n\t" + "AND r9, r9, %[n]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "STRD r4, r5, [sp]\n\t" + "STRD r6, r7, [sp, #32]\n\t" + "LDRD r4, r5, [sp, #8]\n\t" + "LDRD r6, r7, [sp, #40]\n\t" + "EOR r8, r4, r6\n\t" + "EOR r9, r5, r7\n\t" + "AND r8, r8, %[n]\n\t" + "AND r9, r9, %[n]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "STRD r4, r5, [sp, #8]\n\t" + "STRD r6, r7, [sp, #40]\n\t" + "LDRD r4, r5, [sp, #16]\n\t" + "LDRD r6, r7, [sp, #48]\n\t" + "EOR r8, r4, r6\n\t" + "EOR r9, r5, r7\n\t" + "AND r8, r8, %[n]\n\t" + "AND r9, r9, %[n]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "STRD r4, r5, [sp, #16]\n\t" + "STRD r6, r7, [sp, #48]\n\t" + "LDRD r4, r5, [sp, #24]\n\t" + "LDRD r6, r7, [sp, #56]\n\t" + "EOR r8, 
r4, r6\n\t" + "EOR r9, r5, r7\n\t" + "AND r8, r8, %[n]\n\t" + "AND r9, r9, %[n]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r9\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "STRD r4, r5, [sp, #24]\n\t" + "STRD r6, r7, [sp, #56]\n\t" + "LDR %[n], [sp, #184]\n\t" + "STR %[n], [sp, #172]\n\t" + "MOV r3, sp\n\t" + "LDR r2, [sp, #160]\n\t" + "ADD r1, sp, #0x80\n\t" + "LDR r0, [sp, #160]\n\t" + "BL fe_add_sub_op\n\t" + "ADD r3, sp, #0x20\n\t" + "ADD r2, sp, #0x40\n\t" + "ADD r1, sp, #0x60\n\t" + "MOV r0, sp\n\t" + "BL fe_add_sub_op\n\t" + "LDR r2, [sp, #160]\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_mul_op\n\t" + "ADD r2, sp, #0x80\n\t" + "MOV r1, sp\n\t" + "MOV r0, sp\n\t" + "BL fe_mul_op\n\t" + "ADD r1, sp, #0x80\n\t" + "ADD r0, sp, #0x80\n\t" + "BL fe_sq_op\n\t" + "LDR r1, [sp, #160]\n\t" + "ADD r0, sp, #0x60\n\t" + "BL fe_sq_op\n\t" + "MOV r3, sp\n\t" + "ADD r2, sp, #0x20\n\t" + "MOV r1, sp\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_add_sub_op\n\t" + "ADD r2, sp, #0x80\n\t" + "ADD r1, sp, #0x60\n\t" + "LDR r0, [sp, #160]\n\t" + "BL fe_mul_op\n\t" + "ADD r2, sp, #0x80\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x60\n\t" + "BL fe_sub_op\n\t" + "MOV r1, sp\n\t" + "MOV r0, sp\n\t" + "BL fe_sq_op\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_mul121666\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_sq_op\n\t" + "ADD r2, sp, #0x20\n\t" + "ADD r1, sp, #0x80\n\t" + "ADD r0, sp, #0x80\n\t" + "BL fe_add_op\n\t" + "MOV r2, sp\n\t" + "LDR r1, [sp, #168]\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_mul_op\n\t" + "ADD r2, sp, #0x80\n\t" + "ADD r1, sp, #0x60\n\t" + "MOV r0, sp\n\t" + "BL fe_mul_op\n\t" + "LDR %[a], [sp, #176]\n\t" + "LDR %[n], [sp, #180]\n\t" + "SUBS %[n], %[n], #0x1\n\t" + "STR %[n], [sp, #180]\n\t" + "BGE L_curve25519_bits_%=\n\t" + "MOV %[n], #0x1f\n\t" + "STR %[n], [sp, #180]\n\t" + "SUBS %[a], %[a], #0x4\n\t" + "STR %[a], [sp, #176]\n\t" + "BGE L_curve25519_words_%=\n\t" + /* Invert */ + "ADD r1, sp, #0x0\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_sq_op\n\t" + "ADD r1, sp, #0x20\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_sq_op\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_sq_op\n\t" + "ADD r2, sp, #0x40\n\t" + "ADD r1, sp, #0x0\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_mul_op\n\t" + "ADD r2, sp, #0x40\n\t" + "ADD r1, sp, #0x20\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_mul_op\n\t" + "ADD r1, sp, #0x20\n\t" + "ADD r0, sp, #0x60\n\t" + "BL fe_sq_op\n\t" + "ADD r2, sp, #0x60\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_mul_op\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x60\n\t" + "BL fe_sq_op\n\t" + "MOV r12, #0x4\n\t" + "\n" + "L_curve25519_inv_1_%=:\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x60\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_curve25519_inv_1_%=\n\t" + "ADD r2, sp, #0x40\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_mul_op\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x60\n\t" + "BL fe_sq_op\n\t" + "MOV r12, #0x9\n\t" + "\n" + "L_curve25519_inv_2_%=:\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x60\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_curve25519_inv_2_%=\n\t" + "ADD r2, sp, #0x40\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x60\n\t" + "BL fe_mul_op\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x80\n\t" + "BL fe_sq_op\n\t" + "MOV r12, #0x13\n\t" + "\n" + "L_curve25519_inv_3_%=:\n\t" + "ADD r1, sp, #0x80\n\t" + "ADD r0, sp, 
#0x80\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_curve25519_inv_3_%=\n\t" + "ADD r2, sp, #0x60\n\t" + "ADD r1, sp, #0x80\n\t" + "ADD r0, sp, #0x60\n\t" + "BL fe_mul_op\n\t" + "MOV r12, #0xa\n\t" + "\n" + "L_curve25519_inv_4_%=:\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x60\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_curve25519_inv_4_%=\n\t" + "ADD r2, sp, #0x40\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_mul_op\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x60\n\t" + "BL fe_sq_op\n\t" + "MOV r12, #0x31\n\t" + "\n" + "L_curve25519_inv_5_%=:\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x60\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_curve25519_inv_5_%=\n\t" + "ADD r2, sp, #0x40\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x60\n\t" + "BL fe_mul_op\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x80\n\t" + "BL fe_sq_op\n\t" + "MOV r12, #0x63\n\t" + "\n" + "L_curve25519_inv_6_%=:\n\t" + "ADD r1, sp, #0x80\n\t" + "ADD r0, sp, #0x80\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_curve25519_inv_6_%=\n\t" + "ADD r2, sp, #0x60\n\t" + "ADD r1, sp, #0x80\n\t" + "ADD r0, sp, #0x60\n\t" + "BL fe_mul_op\n\t" + "MOV r12, #0x32\n\t" + "\n" + "L_curve25519_inv_7_%=:\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x60\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_curve25519_inv_7_%=\n\t" + "ADD r2, sp, #0x40\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_mul_op\n\t" + "MOV r12, #0x5\n\t" + "\n" + "L_curve25519_inv_8_%=:\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x40\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_curve25519_inv_8_%=\n\t" + "ADD r2, sp, #0x20\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x0\n\t" + "BL fe_mul_op\n\t" + "MOV r2, sp\n\t" + "LDR r1, [sp, #160]\n\t" + "LDR r0, [sp, #160]\n\t" + "BL fe_mul_op\n\t" + "MOV r0, #0x0\n\t" + "ADD sp, sp, #0xbc\n\t" + : [r] "+l" (r), [n] "+l" (n), [a] "+l" (a) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "lr" + ); + return (uint32_t)(size_t)r; +} + +#else +int curve25519(byte* r, const byte* n, const byte* a) +{ + __asm__ __volatile__ ( + "SUB sp, sp, #0xc0\n\t" + "STR %[r], [sp, #176]\n\t" + "STR %[n], [sp, #160]\n\t" + "STR %[a], [sp, #172]\n\t" + "ADD r5, sp, #0x40\n\t" + "ADD r4, sp, #0x20\n\t" + "STR sp, [sp, #184]\n\t" + "STR r5, [sp, #180]\n\t" + "STR r4, [sp, #188]\n\t" + "MOV %[n], #0x0\n\t" + "STR %[n], [sp, #164]\n\t" + /* Set one */ + "MOV r10, #0x1\n\t" + "MOV r11, #0x0\n\t" + "STRD r10, r11, [%[r]]\n\t" + "MOV r10, #0x0\n\t" + "STRD r10, r11, [%[r], #8]\n\t" + "STRD r10, r11, [%[r], #16]\n\t" + "STRD r10, r11, [%[r], #24]\n\t" + /* Set zero */ + "MOV r10, #0x0\n\t" + "MOV r11, #0x0\n\t" + "STRD r10, r11, [sp]\n\t" + "STRD r10, r11, [sp, #8]\n\t" + "STRD r10, r11, [sp, #16]\n\t" + "STRD r10, r11, [sp, #24]\n\t" + /* Set one */ + "MOV r10, #0x1\n\t" + "MOV r11, #0x0\n\t" + "STRD r10, r11, [sp, #32]\n\t" + "MOV r10, #0x0\n\t" + "STRD r10, r11, [sp, #40]\n\t" + "STRD r10, r11, [sp, #48]\n\t" + "STRD r10, r11, [sp, #56]\n\t" + "ADD r3, sp, #0x40\n\t" + /* Copy */ + "LDM r2, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "STM r3, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "MOV %[a], #0xfe\n\t" + "\n" + "L_curve25519_bits_%=:\n\t" + "STR 
%[a], [sp, #168]\n\t" + "LDR %[n], [sp, #160]\n\t" + "AND r4, %[a], #0x1f\n\t" + "LSR %[a], %[a], #5\n\t" + "LDR %[a], [%[n], r2, LSL #2]\n\t" + "RSB r4, r4, #0x1f\n\t" + "LSL %[a], %[a], r4\n\t" + "LDR %[n], [sp, #164]\n\t" + "EOR %[n], %[n], %[a]\n\t" + "ASR %[n], %[n], #31\n\t" + "STR %[a], [sp, #164]\n\t" + /* Conditional Swap */ + "ADD r11, sp, #0xb0\n\t" + "LDM r11, {r4, r5, r6, r7}\n\t" + "EOR r8, r4, r5\n\t" + "EOR r9, r6, r7\n\t" + "AND r8, r8, %[n]\n\t" + "AND r9, r9, %[n]\n\t" + "EOR r4, r4, r8\n\t" + "EOR r5, r5, r8\n\t" + "EOR r6, r6, r9\n\t" + "EOR r7, r7, r9\n\t" + "STM r11, {r4, r5, r6, r7}\n\t" + /* Ladder step */ + "LDR r3, [sp, #184]\n\t" + "LDR r2, [sp, #176]\n\t" + "ADD r1, sp, #0x80\n\t" + "LDR r0, [sp, #176]\n\t" + "BL fe_add_sub_op\n\t" + "LDR r3, [sp, #188]\n\t" + "LDR r2, [sp, #180]\n\t" + "ADD r1, sp, #0x60\n\t" + "LDR r0, [sp, #184]\n\t" + "BL fe_add_sub_op\n\t" + "LDR r2, [sp, #176]\n\t" + "ADD r1, sp, #0x60\n\t" + "LDR r0, [sp, #188]\n\t" + "BL fe_mul_op\n\t" + "ADD r2, sp, #0x80\n\t" + "LDR r1, [sp, #184]\n\t" + "LDR r0, [sp, #184]\n\t" + "BL fe_mul_op\n\t" + "ADD r1, sp, #0x80\n\t" + "ADD r0, sp, #0x60\n\t" + "BL fe_sq_op\n\t" + "LDR r1, [sp, #176]\n\t" + "ADD r0, sp, #0x80\n\t" + "BL fe_sq_op\n\t" + "LDR r3, [sp, #184]\n\t" + "LDR r2, [sp, #188]\n\t" + "LDR r1, [sp, #184]\n\t" + "LDR r0, [sp, #180]\n\t" + "BL fe_add_sub_op\n\t" + "ADD r2, sp, #0x60\n\t" + "ADD r1, sp, #0x80\n\t" + "LDR r0, [sp, #176]\n\t" + "BL fe_mul_op\n\t" + "ADD r2, sp, #0x60\n\t" + "ADD r1, sp, #0x80\n\t" + "ADD r0, sp, #0x80\n\t" + "BL fe_sub_op\n\t" + "LDR r1, [sp, #184]\n\t" + "LDR r0, [sp, #184]\n\t" + "BL fe_sq_op\n\t" + "ADD r1, sp, #0x80\n\t" + "LDR r0, [sp, #188]\n\t" + "BL fe_mul121666\n\t" + "LDR r1, [sp, #180]\n\t" + "LDR r0, [sp, #180]\n\t" + "BL fe_sq_op\n\t" + "LDR r2, [sp, #188]\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x60\n\t" + "BL fe_add_op\n\t" + "LDR r2, [sp, #184]\n\t" + "LDR r1, [sp, #172]\n\t" + "LDR r0, [sp, #188]\n\t" + "BL fe_mul_op\n\t" + "ADD r2, sp, #0x60\n\t" + "ADD r1, sp, #0x80\n\t" + "LDR r0, [sp, #184]\n\t" + "BL fe_mul_op\n\t" + "LDR %[a], [sp, #168]\n\t" + "SUBS %[a], %[a], #0x1\n\t" + "BGE L_curve25519_bits_%=\n\t" + /* Cycle Count: 171 */ + "LDR %[n], [sp, #184]\n\t" + /* Copy */ + "LDM r1, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "STM sp, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + /* Invert */ + "ADD r1, sp, #0x0\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_sq_op\n\t" + "ADD r1, sp, #0x20\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_sq_op\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_sq_op\n\t" + "ADD r2, sp, #0x40\n\t" + "ADD r1, sp, #0x0\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_mul_op\n\t" + "ADD r2, sp, #0x40\n\t" + "ADD r1, sp, #0x20\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_mul_op\n\t" + "ADD r1, sp, #0x20\n\t" + "ADD r0, sp, #0x60\n\t" + "BL fe_sq_op\n\t" + "ADD r2, sp, #0x60\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_mul_op\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x60\n\t" + "BL fe_sq_op\n\t" + "MOV r12, #0x4\n\t" + "\n" + "L_curve25519_inv_1_%=:\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x60\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_curve25519_inv_1_%=\n\t" + "ADD r2, sp, #0x40\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_mul_op\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x60\n\t" + "BL fe_sq_op\n\t" + "MOV r12, #0x9\n\t" + "\n" + "L_curve25519_inv_2_%=:\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x60\n\t" 
+ "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_curve25519_inv_2_%=\n\t" + "ADD r2, sp, #0x40\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x60\n\t" + "BL fe_mul_op\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x80\n\t" + "BL fe_sq_op\n\t" + "MOV r12, #0x13\n\t" + "\n" + "L_curve25519_inv_3_%=:\n\t" + "ADD r1, sp, #0x80\n\t" + "ADD r0, sp, #0x80\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_curve25519_inv_3_%=\n\t" + "ADD r2, sp, #0x60\n\t" + "ADD r1, sp, #0x80\n\t" + "ADD r0, sp, #0x60\n\t" + "BL fe_mul_op\n\t" + "MOV r12, #0xa\n\t" + "\n" + "L_curve25519_inv_4_%=:\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x60\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_curve25519_inv_4_%=\n\t" + "ADD r2, sp, #0x40\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_mul_op\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x60\n\t" + "BL fe_sq_op\n\t" + "MOV r12, #0x31\n\t" + "\n" + "L_curve25519_inv_5_%=:\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x60\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_curve25519_inv_5_%=\n\t" + "ADD r2, sp, #0x40\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x60\n\t" + "BL fe_mul_op\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x80\n\t" + "BL fe_sq_op\n\t" + "MOV r12, #0x63\n\t" + "\n" + "L_curve25519_inv_6_%=:\n\t" + "ADD r1, sp, #0x80\n\t" + "ADD r0, sp, #0x80\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_curve25519_inv_6_%=\n\t" + "ADD r2, sp, #0x60\n\t" + "ADD r1, sp, #0x80\n\t" + "ADD r0, sp, #0x60\n\t" + "BL fe_mul_op\n\t" + "MOV r12, #0x32\n\t" + "\n" + "L_curve25519_inv_7_%=:\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x60\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_curve25519_inv_7_%=\n\t" + "ADD r2, sp, #0x40\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_mul_op\n\t" + "MOV r12, #0x5\n\t" + "\n" + "L_curve25519_inv_8_%=:\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x40\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_curve25519_inv_8_%=\n\t" + "ADD r2, sp, #0x20\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x0\n\t" + "BL fe_mul_op\n\t" + "LDR r2, [sp, #184]\n\t" + "LDR r1, [sp, #176]\n\t" + "LDR r0, [sp, #176]\n\t" + "BL fe_mul_op\n\t" + /* Ensure result is less than modulus */ + "LDR %[r], [sp, #176]\n\t" + "LDM %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "MOV %[a], #0x13\n\t" + "AND %[a], %[a], r11, ASR #31\n\t" + "ADDS r4, r4, %[a]\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADCS r9, r9, #0x0\n\t" + "BFC r11, #31, #1\n\t" + "ADCS r10, r10, #0x0\n\t" + "ADC r11, r11, #0x0\n\t" + "STM %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "MOV r0, #0x0\n\t" + "ADD sp, sp, #0xc0\n\t" + : [r] "+l" (r), [n] "+l" (n), [a] "+l" (a) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12", "lr" + ); + return (uint32_t)(size_t)r; +} + +#endif /* WC_NO_CACHE_RESISTANT */ +#ifdef HAVE_ED25519 +void fe_invert(fe r, const fe a) +{ + __asm__ __volatile__ ( + "SUB sp, sp, #0x88\n\t" + /* Invert */ + "STR %[r], [sp, #128]\n\t" + "STR %[a], [sp, #132]\n\t" + "LDR r1, [sp, #132]\n\t" + "MOV r0, sp\n\t" + "BL fe_sq_op\n\t" + "MOV r1, sp\n\t" + "ADD r0, sp, #0x20\n\t" + "BL 
fe_sq_op\n\t" + "ADD r1, sp, #0x20\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_sq_op\n\t" + "ADD r2, sp, #0x20\n\t" + "LDR r1, [sp, #132]\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_mul_op\n\t" + "ADD r2, sp, #0x20\n\t" + "MOV r1, sp\n\t" + "MOV r0, sp\n\t" + "BL fe_mul_op\n\t" + "MOV r1, sp\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_sq_op\n\t" + "ADD r2, sp, #0x40\n\t" + "ADD r1, sp, #0x20\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_mul_op\n\t" + "ADD r1, sp, #0x20\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_sq_op\n\t" + "MOV r12, #0x4\n\t" + "\n" + "L_fe_invert1_%=:\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x40\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_fe_invert1_%=\n\t" + "ADD r2, sp, #0x20\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_mul_op\n\t" + "ADD r1, sp, #0x20\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_sq_op\n\t" + "MOV r12, #0x9\n\t" + "\n" + "L_fe_invert2_%=:\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x40\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_fe_invert2_%=\n\t" + "ADD r2, sp, #0x20\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_mul_op\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x60\n\t" + "BL fe_sq_op\n\t" + "MOV r12, #0x13\n\t" + "\n" + "L_fe_invert3_%=:\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x60\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_fe_invert3_%=\n\t" + "ADD r2, sp, #0x40\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_mul_op\n\t" + "MOV r12, #0xa\n\t" + "\n" + "L_fe_invert4_%=:\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x40\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_fe_invert4_%=\n\t" + "ADD r2, sp, #0x20\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_mul_op\n\t" + "ADD r1, sp, #0x20\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_sq_op\n\t" + "MOV r12, #0x31\n\t" + "\n" + "L_fe_invert5_%=:\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x40\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_fe_invert5_%=\n\t" + "ADD r2, sp, #0x20\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_mul_op\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x60\n\t" + "BL fe_sq_op\n\t" + "MOV r12, #0x63\n\t" + "\n" + "L_fe_invert6_%=:\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x60\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_fe_invert6_%=\n\t" + "ADD r2, sp, #0x40\n\t" + "ADD r1, sp, #0x60\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_mul_op\n\t" + "MOV r12, #0x32\n\t" + "\n" + "L_fe_invert7_%=:\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x40\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_fe_invert7_%=\n\t" + "ADD r2, sp, #0x20\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_mul_op\n\t" + "MOV r12, #0x5\n\t" + "\n" + "L_fe_invert8_%=:\n\t" + "ADD r1, sp, #0x20\n\t" + "ADD r0, sp, #0x20\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_fe_invert8_%=\n\t" + "MOV r2, sp\n\t" + "ADD r1, sp, #0x20\n\t" + "LDR r0, [sp, #128]\n\t" + "BL fe_mul_op\n\t" + "LDR %[a], [sp, #132]\n\t" + "LDR %[r], [sp, #128]\n\t" + "ADD sp, sp, #0x88\n\t" + : [r] "+l" (r), [a] "+l" (a) + : + : "memory", "lr", "r12", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + 
+void fe_sq2(fe r, const fe a) +{ + __asm__ __volatile__ ( + "SUB sp, sp, #0x24\n\t" + "STRD r0, r1, [sp, #28]\n\t" + "LDM r1, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t" + /* Square * 2 */ + "UMULL r9, r10, r0, r0\n\t" + "UMULL r11, r12, r0, r1\n\t" + "ADDS r11, r11, r11\n\t" + "MOV lr, #0x0\n\t" + "UMAAL r10, r11, lr, lr\n\t" + "STM sp, {r9, r10}\n\t" + "MOV r8, lr\n\t" + "UMAAL r8, r12, r0, r2\n\t" + "ADCS r8, r8, r8\n\t" + "UMAAL r8, r11, r1, r1\n\t" + "UMULL r9, r10, r0, r3\n\t" + "UMAAL r9, r12, r1, r2\n\t" + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r11, lr, lr\n\t" + "STRD r8, r9, [sp, #8]\n\t" + "MOV r9, lr\n\t" + "UMAAL r9, r10, r0, r4\n\t" + "UMAAL r9, r12, r1, r3\n\t" + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r11, r2, r2\n\t" + "STR r9, [sp, #16]\n\t" + "UMULL r9, r8, r0, r5\n\t" + "UMAAL r9, r12, r1, r4\n\t" + "UMAAL r9, r10, r2, r3\n\t" + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r11, lr, lr\n\t" + "STR r9, [sp, #20]\n\t" + "MOV r9, lr\n\t" + "UMAAL r9, r8, r0, r6\n\t" + "UMAAL r9, r12, r1, r5\n\t" + "UMAAL r9, r10, r2, r4\n\t" + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r11, r3, r3\n\t" + "STR r9, [sp, #24]\n\t" + "UMULL r0, r9, r0, r7\n\t" + "UMAAL r0, r8, r1, r6\n\t" + "UMAAL r0, r12, r2, r5\n\t" + "UMAAL r0, r10, r3, r4\n\t" + "ADCS r0, r0, r0\n\t" + "UMAAL r0, r11, lr, lr\n\t" + /* R[7] = r0 */ + "UMAAL r9, r8, r1, r7\n\t" + "UMAAL r9, r10, r2, r6\n\t" + "UMAAL r12, r9, r3, r5\n\t" + "ADCS r12, r12, r12\n\t" + "UMAAL r12, r11, r4, r4\n\t" + /* R[8] = r12 */ + "UMAAL r9, r8, r2, r7\n\t" + "UMAAL r10, r9, r3, r6\n\t" + "MOV r2, lr\n\t" + "UMAAL r10, r2, r4, r5\n\t" + "ADCS r10, r10, r10\n\t" + "UMAAL r11, r10, lr, lr\n\t" + /* R[9] = r11 */ + "UMAAL r2, r8, r3, r7\n\t" + "UMAAL r2, r9, r4, r6\n\t" + "ADCS r3, r2, r2\n\t" + "UMAAL r10, r3, r5, r5\n\t" + /* R[10] = r10 */ + "MOV r1, lr\n\t" + "UMAAL r1, r8, r4, r7\n\t" + "UMAAL r1, r9, r5, r6\n\t" + "ADCS r4, r1, r1\n\t" + "UMAAL r3, r4, lr, lr\n\t" + /* R[11] = r3 */ + "UMAAL r8, r9, r5, r7\n\t" + "ADCS r8, r8, r8\n\t" + "UMAAL r4, r8, r6, r6\n\t" + /* R[12] = r4 */ + "MOV r5, lr\n\t" + "UMAAL r5, r9, r6, r7\n\t" + "ADCS r5, r5, r5\n\t" + "UMAAL r8, r5, lr, lr\n\t" + /* R[13] = r8 */ + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r5, r7, r7\n\t" + "ADCS r7, r5, lr\n\t" + /* R[14] = r9 */ + /* R[15] = r7 */ + /* Reduce */ + "MOV r6, #0x25\n\t" + "UMAAL r7, r0, r7, r6\n\t" + "MOV r6, #0x13\n\t" + "LSL r0, r0, #1\n\t" + "ORR r0, r0, r7, LSR #31\n\t" + "MUL lr, r0, r6\n\t" + "POP {r0, r1}\n\t" + "MOV r6, #0x26\n\t" + "UMAAL r0, lr, r12, r6\n\t" + "UMAAL r1, lr, r11, r6\n\t" + "MOV r12, r3\n\t" + "MOV r11, r4\n\t" + "POP {r2, r3, r4}\n\t" + "UMAAL r2, lr, r10, r6\n\t" + "UMAAL r3, lr, r12, r6\n\t" + "UMAAL r4, lr, r11, r6\n\t" + "MOV r12, r6\n\t" + "POP {r5, r6}\n\t" + "UMAAL r5, lr, r8, r12\n\t" + "BFC r7, #31, #1\n\t" + "UMAAL r6, lr, r9, r12\n\t" + "ADD r7, r7, lr\n\t" + /* Reduce if top bit set */ + "MOV r11, #0x13\n\t" + "AND r12, r11, r7, ASR #31\n\t" + "ADDS r0, r0, r12\n\t" + "ADCS r1, r1, #0x0\n\t" + "ADCS r2, r2, #0x0\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADCS r5, r5, #0x0\n\t" + "BFC r7, #31, #1\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADC r7, r7, #0x0\n\t" + /* Double */ + "ADDS r0, r0, r0\n\t" + "ADCS r1, r1, r1\n\t" + "ADCS r2, r2, r2\n\t" + "ADCS r3, r3, r3\n\t" + "ADCS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + /* Reduce if top bit set */ + "MOV r11, #0x13\n\t" + "AND r12, r11, r7, ASR #31\n\t" + "ADDS r0, r0, r12\n\t" + "ADCS r1, r1, #0x0\n\t" + "ADCS r2, r2, #0x0\n\t" + "ADCS r3, r3, 
#0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADCS r5, r5, #0x0\n\t" + "BFC r7, #31, #1\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADC r7, r7, #0x0\n\t" + "POP {r12, lr}\n\t" + /* Store */ + "STM r12, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t" + "MOV r0, r12\n\t" + "MOV r1, lr\n\t" + : [r] "+l" (r), [a] "+l" (a) + : + : "memory", "lr" + ); +} + +void fe_pow22523(fe r, const fe a) +{ + __asm__ __volatile__ ( + "SUB sp, sp, #0x68\n\t" + /* pow22523 */ + "STR %[r], [sp, #96]\n\t" + "STR %[a], [sp, #100]\n\t" + "LDR r1, [sp, #100]\n\t" + "MOV r0, sp\n\t" + "BL fe_sq_op\n\t" + "MOV r1, sp\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_sq_op\n\t" + "ADD r1, sp, #0x20\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_sq_op\n\t" + "ADD r2, sp, #0x20\n\t" + "LDR r1, [sp, #100]\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_mul_op\n\t" + "ADD r2, sp, #0x20\n\t" + "MOV r1, sp\n\t" + "MOV r0, sp\n\t" + "BL fe_mul_op\n\t" + "MOV r1, sp\n\t" + "MOV r0, sp\n\t" + "BL fe_sq_op\n\t" + "MOV r2, sp\n\t" + "ADD r1, sp, #0x20\n\t" + "MOV r0, sp\n\t" + "BL fe_mul_op\n\t" + "MOV r1, sp\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_sq_op\n\t" + "MOV r12, #0x4\n\t" + "\n" + "L_fe_pow22523_1_%=:\n\t" + "ADD r1, sp, #0x20\n\t" + "ADD r0, sp, #0x20\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_fe_pow22523_1_%=\n\t" + "MOV r2, sp\n\t" + "ADD r1, sp, #0x20\n\t" + "MOV r0, sp\n\t" + "BL fe_mul_op\n\t" + "MOV r1, sp\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_sq_op\n\t" + "MOV r12, #0x9\n\t" + "\n" + "L_fe_pow22523_2_%=:\n\t" + "ADD r1, sp, #0x20\n\t" + "ADD r0, sp, #0x20\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_fe_pow22523_2_%=\n\t" + "MOV r2, sp\n\t" + "ADD r1, sp, #0x20\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_mul_op\n\t" + "ADD r1, sp, #0x20\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_sq_op\n\t" + "MOV r12, #0x13\n\t" + "\n" + "L_fe_pow22523_3_%=:\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x40\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_fe_pow22523_3_%=\n\t" + "ADD r2, sp, #0x20\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_mul_op\n\t" + "MOV r12, #0xa\n\t" + "\n" + "L_fe_pow22523_4_%=:\n\t" + "ADD r1, sp, #0x20\n\t" + "ADD r0, sp, #0x20\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_fe_pow22523_4_%=\n\t" + "MOV r2, sp\n\t" + "ADD r1, sp, #0x20\n\t" + "MOV r0, sp\n\t" + "BL fe_mul_op\n\t" + "MOV r1, sp\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_sq_op\n\t" + "MOV r12, #0x31\n\t" + "\n" + "L_fe_pow22523_5_%=:\n\t" + "ADD r1, sp, #0x20\n\t" + "ADD r0, sp, #0x20\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_fe_pow22523_5_%=\n\t" + "MOV r2, sp\n\t" + "ADD r1, sp, #0x20\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_mul_op\n\t" + "ADD r1, sp, #0x20\n\t" + "ADD r0, sp, #0x40\n\t" + "BL fe_sq_op\n\t" + "MOV r12, #0x63\n\t" + "\n" + "L_fe_pow22523_6_%=:\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x40\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_fe_pow22523_6_%=\n\t" + "ADD r2, sp, #0x20\n\t" + "ADD r1, sp, #0x40\n\t" + "ADD r0, sp, #0x20\n\t" + "BL fe_mul_op\n\t" + "MOV r12, #0x32\n\t" + "\n" + "L_fe_pow22523_7_%=:\n\t" + "ADD r1, sp, #0x20\n\t" + "ADD r0, sp, #0x20\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_fe_pow22523_7_%=\n\t" + "MOV r2, sp\n\t" + "ADD r1, sp, #0x20\n\t" + "MOV r0, sp\n\t" 
+ "BL fe_mul_op\n\t" + "MOV r12, #0x2\n\t" + "\n" + "L_fe_pow22523_8_%=:\n\t" + "MOV r1, sp\n\t" + "MOV r0, sp\n\t" + "PUSH {r12}\n\t" + "BL fe_sq_op\n\t" + "POP {r12}\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_fe_pow22523_8_%=\n\t" + "LDR r2, [sp, #100]\n\t" + "MOV r1, sp\n\t" + "LDR r0, [sp, #96]\n\t" + "BL fe_mul_op\n\t" + "LDR %[a], [sp, #100]\n\t" + "LDR %[r], [sp, #96]\n\t" + "ADD sp, sp, #0x68\n\t" + : [r] "+l" (r), [a] "+l" (a) + : + : "memory", "lr", "r12", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); +} + +void ge_p1p1_to_p2(ge_p2 * r, const ge_p1p1 * p) +{ + __asm__ __volatile__ ( + "SUB sp, sp, #0x8\n\t" + "STR %[r], [sp]\n\t" + "STR %[p], [sp, #4]\n\t" + "ADD r2, r1, #0x60\n\t" + "BL fe_mul_op\n\t" + "LDR r0, [sp]\n\t" + "LDR r1, [sp, #4]\n\t" + "ADD r2, r1, #0x40\n\t" + "ADD r1, r1, #0x20\n\t" + "ADD r0, r0, #0x20\n\t" + "BL fe_mul_op\n\t" + "LDR r0, [sp]\n\t" + "LDR r1, [sp, #4]\n\t" + "ADD r2, r1, #0x60\n\t" + "ADD r1, r1, #0x40\n\t" + "ADD r0, r0, #0x40\n\t" + "BL fe_mul_op\n\t" + "ADD sp, sp, #0x8\n\t" + : [r] "+l" (r), [p] "+l" (p) + : + : "memory", "lr", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" + ); +} + +void ge_p1p1_to_p3(ge_p3 * r, const ge_p1p1 * p) +{ + __asm__ __volatile__ ( + "SUB sp, sp, #0x8\n\t" + "STR %[r], [sp]\n\t" + "STR %[p], [sp, #4]\n\t" + "ADD r2, r1, #0x60\n\t" + "BL fe_mul_op\n\t" + "LDR r0, [sp]\n\t" + "LDR r1, [sp, #4]\n\t" + "ADD r2, r1, #0x40\n\t" + "ADD r1, r1, #0x20\n\t" + "ADD r0, r0, #0x20\n\t" + "BL fe_mul_op\n\t" + "LDR r0, [sp]\n\t" + "LDR r1, [sp, #4]\n\t" + "ADD r2, r1, #0x60\n\t" + "ADD r1, r1, #0x40\n\t" + "ADD r0, r0, #0x40\n\t" + "BL fe_mul_op\n\t" + "LDR r0, [sp]\n\t" + "LDR r1, [sp, #4]\n\t" + "ADD r2, r1, #0x20\n\t" + "ADD r0, r0, #0x60\n\t" + "BL fe_mul_op\n\t" + "ADD sp, sp, #0x8\n\t" + : [r] "+l" (r), [p] "+l" (p) + : + : "memory", "lr", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" + ); +} + +void ge_p2_dbl(ge_p1p1 * r, const ge_p2 * p) +{ + __asm__ __volatile__ ( + "SUB sp, sp, #0x8\n\t" + "STR %[r], [sp]\n\t" + "STR %[p], [sp, #4]\n\t" + "BL fe_sq_op\n\t" + "LDR r0, [sp]\n\t" + "LDR r1, [sp, #4]\n\t" + "ADD r1, r1, #0x20\n\t" + "ADD r0, r0, #0x40\n\t" + "BL fe_sq_op\n\t" + "LDR r0, [sp]\n\t" + "LDR r1, [sp, #4]\n\t" + "ADD r2, r1, #0x20\n\t" + "ADD r0, r0, #0x20\n\t" + "BL fe_add_op\n\t" + "MOV r1, r0\n\t" + "ADD r0, r0, #0x40\n\t" + "BL fe_sq_op\n\t" + "LDR r0, [sp]\n\t" + "MOV r3, r0\n\t" + "ADD r2, r0, #0x40\n\t" + "ADD r1, r0, #0x40\n\t" + "ADD r0, r0, #0x20\n\t" + "BL fe_add_sub_op\n\t" + "MOV r2, r0\n\t" + "ADD r1, r0, #0x40\n\t" + "SUB r0, r0, #0x20\n\t" + "BL fe_sub_op\n\t" + "LDR r1, [sp, #4]\n\t" + "ADD r1, r1, #0x40\n\t" + "ADD r0, r0, #0x60\n\t" + "BL fe_sq2\n\t" + "SUB r2, r0, #0x20\n\t" + "MOV r1, r0\n\t" + "BL fe_sub_op\n\t" + "ADD sp, sp, #0x8\n\t" + : [r] "+l" (r), [p] "+l" (p) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +void ge_madd(ge_p1p1 * r, const ge_p3 * p, const ge_precomp * q) +{ + __asm__ __volatile__ ( + "SUB sp, sp, #0xc\n\t" + "STR %[r], [sp]\n\t" + "STR %[p], [sp, #4]\n\t" + "STR %[q], [sp, #8]\n\t" + "MOV r2, r1\n\t" + "ADD r1, r1, #0x20\n\t" + "BL fe_add_op\n\t" + "LDR r1, [sp, #4]\n\t" + "MOV r2, r1\n\t" + "ADD r1, r1, #0x20\n\t" + "ADD r0, r0, #0x20\n\t" + "BL fe_sub_op\n\t" + "LDR r2, [sp, #8]\n\t" + "SUB r1, r0, #0x20\n\t" + "ADD r0, r0, #0x20\n\t" + "BL fe_mul_op\n\t" + "LDR r0, [sp]\n\t" + "LDR r2, [sp, #8]\n\t" + "ADD r2, r2, #0x20\n\t" + "ADD r1, r0, 
#0x20\n\t" + "ADD r0, r0, #0x20\n\t" + "BL fe_mul_op\n\t" + "LDR r0, [sp]\n\t" + "LDR r1, [sp, #8]\n\t" + "LDR r2, [sp, #4]\n\t" + "ADD r2, r2, #0x60\n\t" + "ADD r1, r1, #0x40\n\t" + "ADD r0, r0, #0x60\n\t" + "BL fe_mul_op\n\t" + "LDR r0, [sp]\n\t" + "ADD r3, r0, #0x20\n\t" + "ADD r2, r0, #0x40\n\t" + "MOV r1, r0\n\t" + "ADD r0, r0, #0x20\n\t" + "BL fe_add_sub_op\n\t" + "LDR r1, [sp, #4]\n\t" + "ADD r1, r1, #0x40\n\t" + "ADD r0, r0, #0x20\n\t" + /* Double */ + "LDM r1, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADDS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "MOV lr, #0x0\n\t" + "ADCS r11, r11, r11\n\t" + "ADC lr, lr, #0x0\n\t" + "MOV r12, #0x13\n\t" + "LSL lr, lr, #1\n\t" + "ORR lr, lr, r11, LSR #31\n\t" + "MUL r12, lr, r12\n\t" + "ADDS r4, r4, r12\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADCS r9, r9, #0x0\n\t" + "BFC r11, #31, #1\n\t" + "ADCS r10, r10, #0x0\n\t" + "ADC r11, r11, #0x0\n\t" + "STM r0, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + /* Done Double */ + "ADD r3, r0, #0x20\n\t" + "ADD r1, r0, #0x20\n\t" + "BL fe_add_sub_op\n\t" + "ADD sp, sp, #0xc\n\t" + : [r] "+l" (r), [p] "+l" (p), [q] "+l" (q) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +void ge_msub(ge_p1p1 * r, const ge_p3 * p, const ge_precomp * q) +{ + __asm__ __volatile__ ( + "SUB sp, sp, #0xc\n\t" + "STR %[r], [sp]\n\t" + "STR %[p], [sp, #4]\n\t" + "STR %[q], [sp, #8]\n\t" + "MOV r2, r1\n\t" + "ADD r1, r1, #0x20\n\t" + "BL fe_add_op\n\t" + "LDR r1, [sp, #4]\n\t" + "MOV r2, r1\n\t" + "ADD r1, r1, #0x20\n\t" + "ADD r0, r0, #0x20\n\t" + "BL fe_sub_op\n\t" + "LDR r2, [sp, #8]\n\t" + "ADD r2, r2, #0x20\n\t" + "SUB r1, r0, #0x20\n\t" + "ADD r0, r0, #0x20\n\t" + "BL fe_mul_op\n\t" + "LDR r0, [sp]\n\t" + "LDR r2, [sp, #8]\n\t" + "ADD r1, r0, #0x20\n\t" + "ADD r0, r0, #0x20\n\t" + "BL fe_mul_op\n\t" + "LDR r0, [sp]\n\t" + "LDR r1, [sp, #8]\n\t" + "LDR r2, [sp, #4]\n\t" + "ADD r2, r2, #0x60\n\t" + "ADD r1, r1, #0x40\n\t" + "ADD r0, r0, #0x60\n\t" + "BL fe_mul_op\n\t" + "LDR r0, [sp]\n\t" + "ADD r3, r0, #0x20\n\t" + "ADD r2, r0, #0x40\n\t" + "MOV r1, r0\n\t" + "ADD r0, r0, #0x20\n\t" + "BL fe_add_sub_op\n\t" + "LDR r1, [sp, #4]\n\t" + "ADD r1, r1, #0x40\n\t" + "ADD r0, r0, #0x20\n\t" + /* Double */ + "LDM r1, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADDS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "MOV lr, #0x0\n\t" + "ADCS r11, r11, r11\n\t" + "ADC lr, lr, #0x0\n\t" + "MOV r12, #0x13\n\t" + "LSL lr, lr, #1\n\t" + "ORR lr, lr, r11, LSR #31\n\t" + "MUL r12, lr, r12\n\t" + "ADDS r4, r4, r12\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADCS r9, r9, #0x0\n\t" + "BFC r11, #31, #1\n\t" + "ADCS r10, r10, #0x0\n\t" + "ADC r11, r11, #0x0\n\t" + "STM r0, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + /* Done Double */ + "ADD r3, r0, #0x20\n\t" + "MOV r1, r0\n\t" + "ADD r0, r0, #0x20\n\t" + "BL fe_add_sub_op\n\t" + "ADD sp, sp, #0xc\n\t" + : [r] "+l" (r), [p] "+l" (p), [q] "+l" (q) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +void ge_add(ge_p1p1 * r, const ge_p3 * p, const ge_cached* q) +{ + __asm__ __volatile__ ( + "SUB sp, sp, #0x2c\n\t" + "STR %[r], [sp]\n\t" + "STR 
%[p], [sp, #4]\n\t" + "STR %[q], [sp, #8]\n\t" + "MOV r3, r1\n\t" + "ADD r2, r1, #0x20\n\t" + "ADD r1, r0, #0x20\n\t" + "BL fe_add_sub_op\n\t" + "LDR r2, [sp, #8]\n\t" + "MOV r1, r0\n\t" + "ADD r0, r0, #0x40\n\t" + "BL fe_mul_op\n\t" + "LDR r0, [sp]\n\t" + "LDR r2, [sp, #8]\n\t" + "ADD r2, r2, #0x20\n\t" + "ADD r1, r0, #0x20\n\t" + "ADD r0, r0, #0x20\n\t" + "BL fe_mul_op\n\t" + "LDR r0, [sp]\n\t" + "LDR r1, [sp, #8]\n\t" + "LDR r2, [sp, #4]\n\t" + "ADD r2, r2, #0x60\n\t" + "ADD r1, r1, #0x60\n\t" + "ADD r0, r0, #0x60\n\t" + "BL fe_mul_op\n\t" + "LDR r0, [sp]\n\t" + "LDR r1, [sp, #4]\n\t" + "LDR r2, [sp, #8]\n\t" + "ADD r2, r2, #0x40\n\t" + "ADD r1, r1, #0x40\n\t" + "BL fe_mul_op\n\t" + "LDR r1, [sp]\n\t" + "ADD r0, sp, #0xc\n\t" + /* Double */ + "LDM r1, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADDS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "MOV lr, #0x0\n\t" + "ADCS r11, r11, r11\n\t" + "ADC lr, lr, #0x0\n\t" + "MOV r12, #0x13\n\t" + "LSL lr, lr, #1\n\t" + "ORR lr, lr, r11, LSR #31\n\t" + "MUL r12, lr, r12\n\t" + "ADDS r4, r4, r12\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADCS r9, r9, #0x0\n\t" + "BFC r11, #31, #1\n\t" + "ADCS r10, r10, #0x0\n\t" + "ADC r11, r11, #0x0\n\t" + "STM r0, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + /* Done Double */ + "ADD r3, r1, #0x20\n\t" + "ADD r2, r1, #0x40\n\t" + "ADD r0, r1, #0x20\n\t" + "BL fe_add_sub_op\n\t" + "ADD r3, r0, #0x40\n\t" + "ADD r2, sp, #0xc\n\t" + "ADD r1, r0, #0x40\n\t" + "ADD r0, r0, #0x20\n\t" + "BL fe_add_sub_op\n\t" + "ADD sp, sp, #0x2c\n\t" + : [r] "+l" (r), [p] "+l" (p), [q] "+l" (q) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +void ge_sub(ge_p1p1 * r, const ge_p3 * p, const ge_cached* q) +{ + __asm__ __volatile__ ( + "SUB sp, sp, #0x2c\n\t" + "STR %[r], [sp]\n\t" + "STR %[p], [sp, #4]\n\t" + "STR %[q], [sp, #8]\n\t" + "MOV r3, r1\n\t" + "ADD r2, r1, #0x20\n\t" + "ADD r1, r0, #0x20\n\t" + "BL fe_add_sub_op\n\t" + "LDR r2, [sp, #8]\n\t" + "ADD r2, r2, #0x20\n\t" + "MOV r1, r0\n\t" + "ADD r0, r0, #0x40\n\t" + "BL fe_mul_op\n\t" + "LDR r0, [sp]\n\t" + "LDR r2, [sp, #8]\n\t" + "ADD r1, r0, #0x20\n\t" + "ADD r0, r0, #0x20\n\t" + "BL fe_mul_op\n\t" + "LDR r0, [sp]\n\t" + "LDR r1, [sp, #8]\n\t" + "LDR r2, [sp, #4]\n\t" + "ADD r2, r2, #0x60\n\t" + "ADD r1, r1, #0x60\n\t" + "ADD r0, r0, #0x60\n\t" + "BL fe_mul_op\n\t" + "LDR r0, [sp]\n\t" + "LDR r1, [sp, #4]\n\t" + "LDR r2, [sp, #8]\n\t" + "ADD r2, r2, #0x40\n\t" + "ADD r1, r1, #0x40\n\t" + "BL fe_mul_op\n\t" + "LDR r1, [sp]\n\t" + "ADD r0, sp, #0xc\n\t" + /* Double */ + "LDM r1, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADDS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "MOV lr, #0x0\n\t" + "ADCS r11, r11, r11\n\t" + "ADC lr, lr, #0x0\n\t" + "MOV r12, #0x13\n\t" + "LSL lr, lr, #1\n\t" + "ORR lr, lr, r11, LSR #31\n\t" + "MUL r12, lr, r12\n\t" + "ADDS r4, r4, r12\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADCS r9, r9, #0x0\n\t" + "BFC r11, #31, #1\n\t" + "ADCS r10, r10, #0x0\n\t" + "ADC r11, r11, #0x0\n\t" + "STM r0, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + /* Done Double */ + "ADD r3, r1, #0x20\n\t" + "ADD r2, r1, #0x40\n\t" + "ADD r0, r1, #0x20\n\t" + "BL 
fe_add_sub_op\n\t" + "ADD r3, r0, #0x40\n\t" + "ADD r2, sp, #0xc\n\t" + "ADD r1, r0, #0x20\n\t" + "ADD r0, r0, #0x40\n\t" + "BL fe_add_sub_op\n\t" + "ADD sp, sp, #0x2c\n\t" + : [r] "+l" (r), [p] "+l" (p), [q] "+l" (q) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +void sc_reduce(byte* s) +{ + __asm__ __volatile__ ( + "SUB sp, sp, #0x34\n\t" + /* Load bits 252-511 */ + "ADD %[s], %[s], #0x1c\n\t" + "LDM %[s], {r1, r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "LSR lr, r9, #24\n\t" + "LSL r9, r9, #4\n\t" + "ORR r9, r9, r8, LSR #28\n\t" + "LSL r8, r8, #4\n\t" + "ORR r8, r8, r7, LSR #28\n\t" + "LSL r7, r7, #4\n\t" + "ORR r7, r7, r6, LSR #28\n\t" + "LSL r6, r6, #4\n\t" + "ORR r6, r6, r5, LSR #28\n\t" + "LSL r5, r5, #4\n\t" + "ORR r5, r5, r4, LSR #28\n\t" + "LSL r4, r4, #4\n\t" + "ORR r4, r4, r3, LSR #28\n\t" + "LSL r3, r3, #4\n\t" + "ORR r3, r3, r2, LSR #28\n\t" + "LSL r2, r2, #4\n\t" + "ORR r2, r2, r1, LSR #28\n\t" + "BFC r9, #28, #4\n\t" + "SUB %[s], %[s], #0x1c\n\t" + /* Add order times bits 504..511 */ + "MOV r10, #0x2c13\n\t" + "MOVT r10, #0xa30a\n\t" + "MOV r11, #0x9ce5\n\t" + "MOVT r11, #0xa7ed\n\t" + "MOV r1, #0x0\n\t" + "UMLAL r2, r1, r10, lr\n\t" + "UMAAL r3, r1, r11, lr\n\t" + "MOV r10, #0x6329\n\t" + "MOVT r10, #0x5d08\n\t" + "MOV r11, #0x621\n\t" + "MOVT r11, #0xeb21\n\t" + "UMAAL r4, r1, r10, lr\n\t" + "UMAAL r5, r1, r11, lr\n\t" + "ADDS r6, r6, r1\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADC r9, r9, #0x0\n\t" + "SUBS r6, r6, lr\n\t" + "SBCS r7, r7, #0x0\n\t" + "SBCS r8, r8, #0x0\n\t" + "SBC r9, r9, #0x0\n\t" + /* Sub product of top 8 words and order */ + "MOV r1, #0x2c13\n\t" + "MOVT r1, #0xa30a\n\t" + "MOV lr, #0x0\n\t" + "LDM %[s]!, {r10, r11, r12}\n\t" + "UMLAL r10, lr, r2, r1\n\t" + "UMAAL r11, lr, r3, r1\n\t" + "UMAAL r12, lr, r4, r1\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM %[s]!, {r10, r11, r12}\n\t" + "UMAAL r10, lr, r5, r1\n\t" + "UMAAL r11, lr, r6, r1\n\t" + "UMAAL r12, lr, r7, r1\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM %[s]!, {r10, r11}\n\t" + "UMAAL r10, lr, r8, r1\n\t" + "BFC r11, #28, #4\n\t" + "UMAAL r11, lr, r9, r1\n\t" + "STM sp!, {r10, r11, lr}\n\t" + "SUB %[s], %[s], #0x10\n\t" + "SUB sp, sp, #0x20\n\t" + "MOV r1, #0x9ce5\n\t" + "MOVT r1, #0xa7ed\n\t" + "MOV lr, #0x0\n\t" + "LDM sp, {r10, r11, r12}\n\t" + "UMLAL r10, lr, r2, r1\n\t" + "UMAAL r11, lr, r3, r1\n\t" + "UMAAL r12, lr, r4, r1\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM sp, {r10, r11, r12}\n\t" + "UMAAL r10, lr, r5, r1\n\t" + "UMAAL r11, lr, r6, r1\n\t" + "UMAAL r12, lr, r7, r1\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM sp, {r10, r11}\n\t" + "UMAAL r10, lr, r8, r1\n\t" + "UMAAL r11, lr, r9, r1\n\t" + "STM sp!, {r10, r11, lr}\n\t" + "SUB sp, sp, #0x20\n\t" + "MOV r1, #0x6329\n\t" + "MOVT r1, #0x5d08\n\t" + "MOV lr, #0x0\n\t" + "LDM sp, {r10, r11, r12}\n\t" + "UMLAL r10, lr, r2, r1\n\t" + "UMAAL r11, lr, r3, r1\n\t" + "UMAAL r12, lr, r4, r1\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM sp, {r10, r11, r12}\n\t" + "UMAAL r10, lr, r5, r1\n\t" + "UMAAL r11, lr, r6, r1\n\t" + "UMAAL r12, lr, r7, r1\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM sp, {r10, r11}\n\t" + "UMAAL r10, lr, r8, r1\n\t" + "UMAAL r11, lr, r9, r1\n\t" + "STM sp!, {r10, r11, lr}\n\t" + "SUB sp, sp, #0x20\n\t" + "MOV r1, #0x621\n\t" + "MOVT r1, #0xeb21\n\t" + "MOV lr, #0x0\n\t" + "LDM sp, {r10, r11, r12}\n\t" + "UMLAL r10, lr, r2, r1\n\t" + "UMAAL r11, lr, r3, r1\n\t" + "UMAAL r12, lr, r4, r1\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM sp, {r10, r11, r12}\n\t" + "UMAAL 
r10, lr, r5, r1\n\t" + "UMAAL r11, lr, r6, r1\n\t" + "UMAAL r12, lr, r7, r1\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM sp, {r10, r11}\n\t" + "UMAAL r10, lr, r8, r1\n\t" + "UMAAL r11, lr, r9, r1\n\t" + "STM sp!, {r10, r11, lr}\n\t" + "SUB sp, sp, #0x20\n\t" + /* Subtract at 4 * 32 */ + "LDM sp, {r10, r11, r12}\n\t" + "SUBS r10, r10, r2\n\t" + "SBCS r11, r11, r3\n\t" + "SBCS r12, r12, r4\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM sp, {r10, r11, r12}\n\t" + "SBCS r10, r10, r5\n\t" + "SBCS r11, r11, r6\n\t" + "SBCS r12, r12, r7\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM sp, {r10, r11}\n\t" + "SBCS r10, r10, r8\n\t" + "SBC r11, r11, r9\n\t" + "STM sp!, {r10, r11}\n\t" + "SUB sp, sp, #0x24\n\t" + "ASR lr, r11, #25\n\t" + /* Conditionally subtract order starting at bit 125 */ + "MOV r1, #0xa0000000\n\t" + "MOV r2, #0xba7d\n\t" + "MOVT r2, #0x4b9e\n\t" + "MOV r3, #0x4c63\n\t" + "MOVT r3, #0xcb02\n\t" + "MOV r4, #0xf39a\n\t" + "MOVT r4, #0xd45e\n\t" + "MOV r5, #0xdf3b\n\t" + "MOVT r5, #0x29b\n\t" + "MOV r9, #0x2000000\n\t" + "AND r1, r1, lr\n\t" + "AND r2, r2, lr\n\t" + "AND r3, r3, lr\n\t" + "AND r4, r4, lr\n\t" + "AND r5, r5, lr\n\t" + "AND r9, r9, lr\n\t" + "LDM sp, {r10, r11, r12}\n\t" + "ADDS r10, r10, r1\n\t" + "ADCS r11, r11, r2\n\t" + "ADCS r12, r12, r3\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM sp, {r10, r11, r12}\n\t" + "ADCS r10, r10, r4\n\t" + "ADCS r11, r11, r5\n\t" + "ADCS r12, r12, #0x0\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM sp, {r10, r11, r12}\n\t" + "ADCS r10, r10, #0x0\n\t" + "ADCS r11, r11, #0x0\n\t" + "ADCS r12, r12, r9\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "SUB sp, sp, #0x30\n\t" + "SUB %[s], %[s], #0x10\n\t" + /* Load bits 252-376 */ + "ADD sp, sp, #0x1c\n\t" + "LDM sp, {r1, r2, r3, r4, r5}\n\t" + "LSL r5, r5, #4\n\t" + "ORR r5, r5, r4, LSR #28\n\t" + "LSL r4, r4, #4\n\t" + "ORR r4, r4, r3, LSR #28\n\t" + "LSL r3, r3, #4\n\t" + "ORR r3, r3, r2, LSR #28\n\t" + "LSL r2, r2, #4\n\t" + "ORR r2, r2, r1, LSR #28\n\t" + "BFC r5, #29, #3\n\t" + "SUB sp, sp, #0x1c\n\t" + /* Sub product of top 8 words and order */ + /* * -5cf5d3ed */ + "MOV r1, #0x2c13\n\t" + "MOVT r1, #0xa30a\n\t" + "MOV lr, #0x0\n\t" + "LDM sp, {r6, r7, r8, r9}\n\t" + "UMLAL r6, lr, r2, r1\n\t" + "UMAAL r7, lr, r3, r1\n\t" + "UMAAL r8, lr, r4, r1\n\t" + "UMAAL r9, lr, r5, r1\n\t" + "STM sp, {r6, r7, r8, r9}\n\t" + "ADD sp, sp, #0x4\n\t" + /* * -5812631b */ + "MOV r1, #0x9ce5\n\t" + "MOVT r1, #0xa7ed\n\t" + "MOV r10, #0x0\n\t" + "LDM sp, {r6, r7, r8, r9}\n\t" + "UMLAL r6, r10, r2, r1\n\t" + "UMAAL r7, r10, r3, r1\n\t" + "UMAAL r8, r10, r4, r1\n\t" + "UMAAL r9, r10, r5, r1\n\t" + "STM sp, {r6, r7, r8, r9}\n\t" + "ADD sp, sp, #0x4\n\t" + /* * -a2f79cd7 */ + "MOV r1, #0x6329\n\t" + "MOVT r1, #0x5d08\n\t" + "MOV r11, #0x0\n\t" + "LDM sp, {r6, r7, r8, r9}\n\t" + "UMLAL r6, r11, r2, r1\n\t" + "UMAAL r7, r11, r3, r1\n\t" + "UMAAL r8, r11, r4, r1\n\t" + "UMAAL r9, r11, r5, r1\n\t" + "STM sp, {r6, r7, r8, r9}\n\t" + "ADD sp, sp, #0x4\n\t" + /* * -14def9df */ + "MOV r1, #0x621\n\t" + "MOVT r1, #0xeb21\n\t" + "MOV r12, #0x0\n\t" + "LDM sp, {r6, r7, r8, r9}\n\t" + "UMLAL r6, r12, r2, r1\n\t" + "UMAAL r7, r12, r3, r1\n\t" + "UMAAL r8, r12, r4, r1\n\t" + "UMAAL r9, r12, r5, r1\n\t" + "STM sp, {r6, r7, r8, r9}\n\t" + "ADD sp, sp, #0x4\n\t" + /* Add overflows at 4 * 32 */ + "LDM sp, {r6, r7, r8, r9}\n\t" + "BFC r9, #28, #4\n\t" + "ADDS r6, r6, lr\n\t" + "ADCS r7, r7, r10\n\t" + "ADCS r8, r8, r11\n\t" + "ADC r9, r9, r12\n\t" + /* Subtract top at 4 * 32 */ + "SUBS r6, r6, r2\n\t" + "SBCS r7, r7, r3\n\t" + "SBCS r8, r8, 
r4\n\t" + "SBCS r9, r9, r5\n\t" + "SBC r1, r1, r1\n\t" + "SUB sp, sp, #0x10\n\t" + "LDM sp, {r2, r3, r4, r5}\n\t" + "MOV r10, #0xd3ed\n\t" + "MOVT r10, #0x5cf5\n\t" + "MOV r11, #0x631a\n\t" + "MOVT r11, #0x5812\n\t" + "MOV r12, #0x9cd6\n\t" + "MOVT r12, #0xa2f7\n\t" + "MOV lr, #0xf9de\n\t" + "MOVT lr, #0x14de\n\t" + "AND r10, r10, r1\n\t" + "AND r11, r11, r1\n\t" + "AND r12, r12, r1\n\t" + "AND lr, lr, r1\n\t" + "ADDS r2, r2, r10\n\t" + "ADCS r3, r3, r11\n\t" + "ADCS r4, r4, r12\n\t" + "ADCS r5, r5, lr\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "AND r1, r1, #0x10000000\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADC r9, r9, r1\n\t" + "BFC r9, #28, #4\n\t" + /* Store result */ + "STM %[s], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "ADD sp, sp, #0x34\n\t" + : [s] "+l" (s) + : + : "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +void sc_muladd(byte* s, const byte* a, const byte* b, const byte* c) +{ + __asm__ __volatile__ ( + "SUB sp, sp, #0x50\n\t" + "ADD lr, sp, #0x44\n\t" + "STM lr, {%[s], %[a], %[c]}\n\t" + "MOV lr, %[b]\n\t" + "LDM %[a], {%[s], %[a], %[b], %[c]}\n\t" + "LDM lr!, {r4, r5, r6}\n\t" + "UMULL r10, r11, %[s], r4\n\t" + "UMULL r12, r7, %[a], r4\n\t" + "UMAAL r11, r12, %[s], r5\n\t" + "UMULL r8, r9, %[b], r4\n\t" + "UMAAL r12, r8, %[a], r5\n\t" + "UMAAL r12, r7, %[s], r6\n\t" + "UMAAL r8, r9, %[c], r4\n\t" + "STM sp, {r10, r11, r12}\n\t" + "UMAAL r7, r8, %[b], r5\n\t" + "LDM lr!, {r4}\n\t" + "UMULL r10, r11, %[a], r6\n\t" + "UMAAL r8, r9, %[b], r6\n\t" + "UMAAL r7, r10, %[s], r4\n\t" + "UMAAL r8, r11, %[c], r5\n\t" + "STR r7, [sp, #12]\n\t" + "UMAAL r8, r10, %[a], r4\n\t" + "UMAAL r9, r11, %[c], r6\n\t" + "UMAAL r9, r10, %[b], r4\n\t" + "UMAAL r10, r11, %[c], r4\n\t" + "LDM lr, {r4, r5, r6, r7}\n\t" + "MOV r12, #0x0\n\t" + "UMLAL r8, r12, %[s], r4\n\t" + "UMAAL r9, r12, %[a], r4\n\t" + "UMAAL r10, r12, %[b], r4\n\t" + "UMAAL r11, r12, %[c], r4\n\t" + "MOV r4, #0x0\n\t" + "UMLAL r9, r4, %[s], r5\n\t" + "UMAAL r10, r4, %[a], r5\n\t" + "UMAAL r11, r4, %[b], r5\n\t" + "UMAAL r12, r4, %[c], r5\n\t" + "MOV r5, #0x0\n\t" + "UMLAL r10, r5, %[s], r6\n\t" + "UMAAL r11, r5, %[a], r6\n\t" + "UMAAL r12, r5, %[b], r6\n\t" + "UMAAL r4, r5, %[c], r6\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r11, r6, %[s], r7\n\t" + "LDR %[s], [sp, #72]\n\t" + "UMAAL r12, r6, %[a], r7\n\t" + "ADD %[s], %[s], #0x10\n\t" + "UMAAL r4, r6, %[b], r7\n\t" + "SUB lr, lr, #0x10\n\t" + "UMAAL r5, r6, %[c], r7\n\t" + "LDM %[s], {%[s], %[a], %[b], %[c]}\n\t" + "STR r6, [sp, #64]\n\t" + "LDM lr!, {r6}\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r8, r7, %[s], r6\n\t" + "UMAAL r9, r7, %[a], r6\n\t" + "STR r8, [sp, #16]\n\t" + "UMAAL r10, r7, %[b], r6\n\t" + "UMAAL r11, r7, %[c], r6\n\t" + "LDM lr!, {r6}\n\t" + "MOV r8, #0x0\n\t" + "UMLAL r9, r8, %[s], r6\n\t" + "UMAAL r10, r8, %[a], r6\n\t" + "STR r9, [sp, #20]\n\t" + "UMAAL r11, r8, %[b], r6\n\t" + "UMAAL r12, r8, %[c], r6\n\t" + "LDM lr!, {r6}\n\t" + "MOV r9, #0x0\n\t" + "UMLAL r10, r9, %[s], r6\n\t" + "UMAAL r11, r9, %[a], r6\n\t" + "STR r10, [sp, #24]\n\t" + "UMAAL r12, r9, %[b], r6\n\t" + "UMAAL r4, r9, %[c], r6\n\t" + "LDM lr!, {r6}\n\t" + "MOV r10, #0x0\n\t" + "UMLAL r11, r10, %[s], r6\n\t" + "UMAAL r12, r10, %[a], r6\n\t" + "STR r11, [sp, #28]\n\t" + "UMAAL r4, r10, %[b], r6\n\t" + "UMAAL r5, r10, %[c], r6\n\t" + "LDM lr!, {r11}\n\t" + "UMAAL r12, r7, %[s], r11\n\t" + "UMAAL r4, r7, %[a], r11\n\t" + "LDR r6, [sp, #64]\n\t" + "UMAAL r5, r7, %[b], r11\n\t" + "UMAAL r6, r7, %[c], r11\n\t" + "LDM lr!, {r11}\n\t" + "UMAAL r4, r8, 
%[s], r11\n\t" + "UMAAL r5, r8, %[a], r11\n\t" + "UMAAL r6, r8, %[b], r11\n\t" + "UMAAL r7, r8, %[c], r11\n\t" + "LDM lr, {r11, lr}\n\t" + "UMAAL r5, r9, %[s], r11\n\t" + "UMAAL r6, r10, %[s], lr\n\t" + "UMAAL r6, r9, %[a], r11\n\t" + "UMAAL r7, r10, %[a], lr\n\t" + "UMAAL r7, r9, %[b], r11\n\t" + "UMAAL r8, r10, %[b], lr\n\t" + "UMAAL r8, r9, %[c], r11\n\t" + "UMAAL r9, r10, %[c], lr\n\t" + "MOV %[c], r12\n\t" + "ADD lr, sp, #0x20\n\t" + "STM lr, {%[c], r4, r5, r6, r7, r8, r9, r10}\n\t" + "LDR %[s], [sp, #68]\n\t" + /* Add c to a * b */ + "LDR lr, [sp, #76]\n\t" + "LDM sp!, {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" + "LDM lr!, {%[a], r10, r11, r12}\n\t" + "ADDS %[b], %[b], %[a]\n\t" + "ADCS %[c], %[c], r10\n\t" + "ADCS r4, r4, r11\n\t" + "ADCS r5, r5, r12\n\t" + "LDM lr!, {%[a], r10, r11, r12}\n\t" + "ADCS r6, r6, %[a]\n\t" + "ADCS r7, r7, r10\n\t" + "ADCS r8, r8, r11\n\t" + "ADCS r9, r9, r12\n\t" + "MOV %[a], r9\n\t" + "STM %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" + "LDM sp, {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" + "ADCS %[b], %[b], #0x0\n\t" + "ADCS %[c], %[c], #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADC r9, r9, #0x0\n\t" + "SUB sp, sp, #0x20\n\t" + /* Get 252..503 and 504..507 */ + "LSR lr, r9, #24\n\t" + "BFC r9, #24, #8\n\t" + "LSL r9, r9, #4\n\t" + "ORR r9, r9, r8, LSR #28\n\t" + "LSL r8, r8, #4\n\t" + "ORR r8, r8, r7, LSR #28\n\t" + "LSL r7, r7, #4\n\t" + "ORR r7, r7, r6, LSR #28\n\t" + "LSL r6, r6, #4\n\t" + "ORR r6, r6, r5, LSR #28\n\t" + "LSL r5, r5, #4\n\t" + "ORR r5, r5, r4, LSR #28\n\t" + "LSL r4, r4, #4\n\t" + "ORR r4, r4, %[c], LSR #28\n\t" + "LSL %[c], %[c], #4\n\t" + "ORR %[c], %[c], %[b], LSR #28\n\t" + "LSL %[b], %[b], #4\n\t" + "ORR %[b], %[b], %[a], LSR #28\n\t" + /* Add order times bits 504..507 */ + "MOV r10, #0x2c13\n\t" + "MOVT r10, #0xa30a\n\t" + "MOV r11, #0x9ce5\n\t" + "MOVT r11, #0xa7ed\n\t" + "MOV %[a], #0x0\n\t" + "UMLAL %[b], %[a], r10, lr\n\t" + "UMAAL %[c], %[a], r11, lr\n\t" + "MOV r10, #0x6329\n\t" + "MOVT r10, #0x5d08\n\t" + "MOV r11, #0x621\n\t" + "MOVT r11, #0xeb21\n\t" + "UMAAL r4, %[a], r10, lr\n\t" + "UMAAL r5, %[a], r11, lr\n\t" + "ADDS r6, r6, %[a]\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADC r9, r9, #0x0\n\t" + "SUBS r6, r6, lr\n\t" + "SBCS r7, r7, #0x0\n\t" + "SBCS r8, r8, #0x0\n\t" + "SBC r9, r9, #0x0\n\t" + /* Sub product of top 8 words and order */ + "MOV %[a], #0x2c13\n\t" + "MOVT %[a], #0xa30a\n\t" + "MOV lr, #0x0\n\t" + "LDM %[s]!, {r10, r11, r12}\n\t" + "UMLAL r10, lr, %[b], %[a]\n\t" + "UMAAL r11, lr, %[c], %[a]\n\t" + "UMAAL r12, lr, r4, %[a]\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM %[s]!, {r10, r11, r12}\n\t" + "UMAAL r10, lr, r5, %[a]\n\t" + "UMAAL r11, lr, r6, %[a]\n\t" + "UMAAL r12, lr, r7, %[a]\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM %[s]!, {r10, r11}\n\t" + "UMAAL r10, lr, r8, %[a]\n\t" + "BFC r11, #28, #4\n\t" + "UMAAL r11, lr, r9, %[a]\n\t" + "STM sp!, {r10, r11, lr}\n\t" + "SUB %[s], %[s], #0x10\n\t" + "SUB sp, sp, #0x20\n\t" + "MOV %[a], #0x9ce5\n\t" + "MOVT %[a], #0xa7ed\n\t" + "MOV lr, #0x0\n\t" + "LDM sp, {r10, r11, r12}\n\t" + "UMLAL r10, lr, %[b], %[a]\n\t" + "UMAAL r11, lr, %[c], %[a]\n\t" + "UMAAL r12, lr, r4, %[a]\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM sp, {r10, r11, r12}\n\t" + "UMAAL r10, lr, r5, %[a]\n\t" + "UMAAL r11, lr, r6, %[a]\n\t" + "UMAAL r12, lr, r7, %[a]\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM sp, {r10, r11}\n\t" + "UMAAL r10, lr, r8, %[a]\n\t" + "UMAAL 
r11, lr, r9, %[a]\n\t" + "STM sp!, {r10, r11, lr}\n\t" + "SUB sp, sp, #0x20\n\t" + "MOV %[a], #0x6329\n\t" + "MOVT %[a], #0x5d08\n\t" + "MOV lr, #0x0\n\t" + "LDM sp, {r10, r11, r12}\n\t" + "UMLAL r10, lr, %[b], %[a]\n\t" + "UMAAL r11, lr, %[c], %[a]\n\t" + "UMAAL r12, lr, r4, %[a]\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM sp, {r10, r11, r12}\n\t" + "UMAAL r10, lr, r5, %[a]\n\t" + "UMAAL r11, lr, r6, %[a]\n\t" + "UMAAL r12, lr, r7, %[a]\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM sp, {r10, r11}\n\t" + "UMAAL r10, lr, r8, %[a]\n\t" + "UMAAL r11, lr, r9, %[a]\n\t" + "STM sp!, {r10, r11, lr}\n\t" + "SUB sp, sp, #0x20\n\t" + "MOV %[a], #0x621\n\t" + "MOVT %[a], #0xeb21\n\t" + "MOV lr, #0x0\n\t" + "LDM sp, {r10, r11, r12}\n\t" + "UMLAL r10, lr, %[b], %[a]\n\t" + "UMAAL r11, lr, %[c], %[a]\n\t" + "UMAAL r12, lr, r4, %[a]\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM sp, {r10, r11, r12}\n\t" + "UMAAL r10, lr, r5, %[a]\n\t" + "UMAAL r11, lr, r6, %[a]\n\t" + "UMAAL r12, lr, r7, %[a]\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM sp, {r10, r11}\n\t" + "UMAAL r10, lr, r8, %[a]\n\t" + "UMAAL r11, lr, r9, %[a]\n\t" + "STM sp!, {r10, r11, lr}\n\t" + "SUB sp, sp, #0x20\n\t" + /* Subtract at 4 * 32 */ + "LDM sp, {r10, r11, r12}\n\t" + "SUBS r10, r10, %[b]\n\t" + "SBCS r11, r11, %[c]\n\t" + "SBCS r12, r12, r4\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM sp, {r10, r11, r12}\n\t" + "SBCS r10, r10, r5\n\t" + "SBCS r11, r11, r6\n\t" + "SBCS r12, r12, r7\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM sp, {r10, r11}\n\t" + "SBCS r10, r10, r8\n\t" + "SBC r11, r11, r9\n\t" + "STM sp!, {r10, r11}\n\t" + "SUB sp, sp, #0x24\n\t" + "ASR lr, r11, #25\n\t" + /* Conditionally subtract order starting at bit 125 */ + "MOV %[a], #0xa0000000\n\t" + "MOV %[b], #0xba7d\n\t" + "MOVT %[b], #0x4b9e\n\t" + "MOV %[c], #0x4c63\n\t" + "MOVT %[c], #0xcb02\n\t" + "MOV r4, #0xf39a\n\t" + "MOVT r4, #0xd45e\n\t" + "MOV r5, #0xdf3b\n\t" + "MOVT r5, #0x29b\n\t" + "MOV r9, #0x2000000\n\t" + "AND %[a], %[a], lr\n\t" + "AND %[b], %[b], lr\n\t" + "AND %[c], %[c], lr\n\t" + "AND r4, r4, lr\n\t" + "AND r5, r5, lr\n\t" + "AND r9, r9, lr\n\t" + "LDM sp, {r10, r11, r12}\n\t" + "ADDS r10, r10, %[a]\n\t" + "ADCS r11, r11, %[b]\n\t" + "ADCS r12, r12, %[c]\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM sp, {r10, r11, r12}\n\t" + "ADCS r10, r10, r4\n\t" + "ADCS r11, r11, r5\n\t" + "ADCS r12, r12, #0x0\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "LDM sp, {r10, r11, r12}\n\t" + "ADCS r10, r10, #0x0\n\t" + "ADCS r11, r11, #0x0\n\t" + "ADCS r12, r12, r9\n\t" + "STM sp!, {r10, r11, r12}\n\t" + "SUB sp, sp, #0x30\n\t" + "SUB %[s], %[s], #0x10\n\t" + /* Load bits 252-376 */ + "ADD sp, sp, #0x1c\n\t" + "LDM sp, {%[a], %[b], %[c], r4, r5}\n\t" + "LSL r5, r5, #4\n\t" + "ORR r5, r5, r4, LSR #28\n\t" + "LSL r4, r4, #4\n\t" + "ORR r4, r4, %[c], LSR #28\n\t" + "LSL %[c], %[c], #4\n\t" + "ORR %[c], %[c], %[b], LSR #28\n\t" + "LSL %[b], %[b], #4\n\t" + "ORR %[b], %[b], %[a], LSR #28\n\t" + "BFC r5, #29, #3\n\t" + "SUB sp, sp, #0x1c\n\t" + /* Sub product of top 8 words and order */ + /* * -5cf5d3ed */ + "MOV %[a], #0x2c13\n\t" + "MOVT %[a], #0xa30a\n\t" + "MOV lr, #0x0\n\t" + "LDM sp, {r6, r7, r8, r9}\n\t" + "UMLAL r6, lr, %[b], %[a]\n\t" + "UMAAL r7, lr, %[c], %[a]\n\t" + "UMAAL r8, lr, r4, %[a]\n\t" + "UMAAL r9, lr, r5, %[a]\n\t" + "STM sp, {r6, r7, r8, r9}\n\t" + "ADD sp, sp, #0x4\n\t" + /* * -5812631b */ + "MOV %[a], #0x9ce5\n\t" + "MOVT %[a], #0xa7ed\n\t" + "MOV r10, #0x0\n\t" + "LDM sp, {r6, r7, r8, r9}\n\t" + "UMLAL r6, r10, %[b], %[a]\n\t" + "UMAAL r7, r10, %[c], 
%[a]\n\t" + "UMAAL r8, r10, r4, %[a]\n\t" + "UMAAL r9, r10, r5, %[a]\n\t" + "STM sp, {r6, r7, r8, r9}\n\t" + "ADD sp, sp, #0x4\n\t" + /* * -a2f79cd7 */ + "MOV %[a], #0x6329\n\t" + "MOVT %[a], #0x5d08\n\t" + "MOV r11, #0x0\n\t" + "LDM sp, {r6, r7, r8, r9}\n\t" + "UMLAL r6, r11, %[b], %[a]\n\t" + "UMAAL r7, r11, %[c], %[a]\n\t" + "UMAAL r8, r11, r4, %[a]\n\t" + "UMAAL r9, r11, r5, %[a]\n\t" + "STM sp, {r6, r7, r8, r9}\n\t" + "ADD sp, sp, #0x4\n\t" + /* * -14def9df */ + "MOV %[a], #0x621\n\t" + "MOVT %[a], #0xeb21\n\t" + "MOV r12, #0x0\n\t" + "LDM sp, {r6, r7, r8, r9}\n\t" + "UMLAL r6, r12, %[b], %[a]\n\t" + "UMAAL r7, r12, %[c], %[a]\n\t" + "UMAAL r8, r12, r4, %[a]\n\t" + "UMAAL r9, r12, r5, %[a]\n\t" + "STM sp, {r6, r7, r8, r9}\n\t" + "ADD sp, sp, #0x4\n\t" + /* Add overflows at 4 * 32 */ + "LDM sp, {r6, r7, r8, r9}\n\t" + "BFC r9, #28, #4\n\t" + "ADDS r6, r6, lr\n\t" + "ADCS r7, r7, r10\n\t" + "ADCS r8, r8, r11\n\t" + "ADC r9, r9, r12\n\t" + /* Subtract top at 4 * 32 */ + "SUBS r6, r6, %[b]\n\t" + "SBCS r7, r7, %[c]\n\t" + "SBCS r8, r8, r4\n\t" + "SBCS r9, r9, r5\n\t" + "SBC %[a], %[a], %[a]\n\t" + "SUB sp, sp, #0x10\n\t" + "LDM sp, {%[b], %[c], r4, r5}\n\t" + "MOV r10, #0xd3ed\n\t" + "MOVT r10, #0x5cf5\n\t" + "MOV r11, #0x631a\n\t" + "MOVT r11, #0x5812\n\t" + "MOV r12, #0x9cd6\n\t" + "MOVT r12, #0xa2f7\n\t" + "MOV lr, #0xf9de\n\t" + "MOVT lr, #0x14de\n\t" + "AND r10, r10, %[a]\n\t" + "AND r11, r11, %[a]\n\t" + "AND r12, r12, %[a]\n\t" + "AND lr, lr, %[a]\n\t" + "ADDS %[b], %[b], r10\n\t" + "ADCS %[c], %[c], r11\n\t" + "ADCS r4, r4, r12\n\t" + "ADCS r5, r5, lr\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "AND %[a], %[a], #0x10000000\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADC r9, r9, %[a]\n\t" + "BFC r9, #28, #4\n\t" + /* Store result */ + "STM %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" + "ADD sp, sp, #0x50\n\t" + : [s] "+l" (s), [a] "+l" (a), [b] "+l" (b), [c] "+l" (c) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +#endif /* HAVE_ED25519 */ + +#endif /* !CURVE25519_SMALL || !ED25519_SMALL */ +#endif /* HAVE_CURVE25519 || HAVE_ED25519 */ +#endif /* !__aarch64__ && __thumb__ */ +#endif /* WOLFSSL_ARMASM */ +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-sha256-asm.S b/wolfcrypt/src/port/arm/thumb2-sha256-asm.S new file mode 100644 index 0000000000..e9721428e1 --- /dev/null +++ b/wolfcrypt/src/port/arm/thumb2-sha256-asm.S @@ -0,0 +1,1474 @@ +/* thumb2-sha256-asm + * + * Copyright (C) 2006-2023 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha2/sha256.rb thumb2 ../wolfssl/wolfcrypt/src/port/arm/thumb2-sha256-asm.S + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__thumb__) +#ifndef WOLFSSL_ARMASM_INLINE + .thumb + .syntax unified +#ifndef NO_SHA256 +#ifdef WOLFSSL_ARMASM_NO_NEON + .text + .type L_SHA256_transform_len_k, %object + .size L_SHA256_transform_len_k, 256 + .align 4 +L_SHA256_transform_len_k: + .word 0x428a2f98 + .word 0x71374491 + .word 0xb5c0fbcf + .word 0xe9b5dba5 + .word 0x3956c25b + .word 0x59f111f1 + .word 0x923f82a4 + .word 0xab1c5ed5 + .word 0xd807aa98 + .word 0x12835b01 + .word 0x243185be + .word 0x550c7dc3 + .word 0x72be5d74 + .word 0x80deb1fe + .word 0x9bdc06a7 + .word 0xc19bf174 + .word 0xe49b69c1 + .word 0xefbe4786 + .word 0xfc19dc6 + .word 0x240ca1cc + .word 0x2de92c6f + .word 0x4a7484aa + .word 0x5cb0a9dc + .word 0x76f988da + .word 0x983e5152 + .word 0xa831c66d + .word 0xb00327c8 + .word 0xbf597fc7 + .word 0xc6e00bf3 + .word 0xd5a79147 + .word 0x6ca6351 + .word 0x14292967 + .word 0x27b70a85 + .word 0x2e1b2138 + .word 0x4d2c6dfc + .word 0x53380d13 + .word 0x650a7354 + .word 0x766a0abb + .word 0x81c2c92e + .word 0x92722c85 + .word 0xa2bfe8a1 + .word 0xa81a664b + .word 0xc24b8b70 + .word 0xc76c51a3 + .word 0xd192e819 + .word 0xd6990624 + .word 0xf40e3585 + .word 0x106aa070 + .word 0x19a4c116 + .word 0x1e376c08 + .word 0x2748774c + .word 0x34b0bcb5 + .word 0x391c0cb3 + .word 0x4ed8aa4a + .word 0x5b9cca4f + .word 0x682e6ff3 + .word 0x748f82ee + .word 0x78a5636f + .word 0x84c87814 + .word 0x8cc70208 + .word 0x90befffa + .word 0xa4506ceb + .word 0xbef9a3f7 + .word 0xc67178f2 + .text + .align 4 + .globl Transform_Sha256_Len + .type Transform_Sha256_Len, %function +Transform_Sha256_Len: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0xc0 + ADR r3, L_SHA256_transform_len_k + # Copy digest to add in at end + LDRD r4, r5, [r0] + LDRD r6, r7, [r0, #8] + LDRD r8, r9, [r0, #16] + LDRD r10, r11, [r0, #24] + STRD r4, r5, [sp, #64] + STRD r6, r7, [sp, #72] + STRD r8, r9, [sp, #80] + STRD r10, r11, [sp, #88] + # Start of loop processing a block +L_SHA256_transform_len_begin: + # Load, Reverse and Store W - 64 bytes + LDRD r4, r5, [r1] + LDRD r6, r7, [r1, #8] + LDRD r8, r9, [r1, #16] + LDRD r10, r11, [r1, #24] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + REV r8, r8 + REV r9, r9 + REV r10, r10 + REV r11, r11 + STRD r4, r5, [sp] + STRD r6, r7, [sp, #8] + STRD r8, r9, [sp, #16] + STRD r10, r11, [sp, #24] + LDRD r4, r5, [r1, #32] + LDRD r6, r7, [r1, #40] + LDRD r8, r9, [r1, #48] + LDRD r10, r11, [r1, #56] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + REV r8, r8 + REV r9, r9 + REV r10, r10 + REV r11, r11 + STRD r4, r5, [sp, #32] + STRD r6, r7, [sp, #40] + STRD r8, r9, [sp, #48] + STRD r10, r11, [sp, #56] + LDR r11, [r0, #4] + LDR r4, [r0, #8] + EOR r11, r11, r4 + MOV r12, #0x3 + # Start of 16 rounds +L_SHA256_transform_len_start: + # Round 0 + LDR r5, [r0, #16] + LDR r6, [r0, #20] + LDR r7, [r0, #24] + LDR r9, [r0, #28] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp] + LDR r6, [r3] + ADD r9, r9, r5 
+ ADD r9, r9, r6 + LDR r5, [r0] + LDR r6, [r0, #4] + LDR r7, [r0, #8] + LDR r8, [r0, #12] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #12] + STR r9, [r0, #28] + # Calc new W[0] + LDR r6, [sp, #56] + LDR r7, [sp, #36] + LDR r8, [sp, #4] + LDR r9, [sp] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp] + # Round 1 + LDR r5, [r0, #12] + LDR r6, [r0, #16] + LDR r7, [r0, #20] + LDR r9, [r0, #24] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #4] + LDR r6, [r3, #4] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #28] + LDR r6, [r0] + LDR r7, [r0, #4] + LDR r8, [r0, #8] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #8] + STR r9, [r0, #24] + # Calc new W[1] + LDR r6, [sp, #60] + LDR r7, [sp, #40] + LDR r8, [sp, #8] + LDR r9, [sp, #4] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #4] + # Round 2 + LDR r5, [r0, #8] + LDR r6, [r0, #12] + LDR r7, [r0, #16] + LDR r9, [r0, #20] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #8] + LDR r6, [r3, #8] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #24] + LDR r6, [r0, #28] + LDR r7, [r0] + LDR r8, [r0, #4] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #4] + STR r9, [r0, #20] + # Calc new W[2] + LDR r6, [sp] + LDR r7, [sp, #44] + LDR r8, [sp, #12] + LDR r9, [sp, #8] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #8] + # Round 3 + LDR r5, [r0, #4] + LDR r6, [r0, #8] + LDR r7, [r0, #12] + LDR r9, [r0, #16] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #12] + LDR r6, [r3, #12] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #20] + LDR r6, [r0, #24] + LDR r7, [r0, #28] + LDR r8, [r0] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0] + STR r9, [r0, #16] + # Calc new W[3] + LDR r6, [sp, #4] + LDR r7, [sp, #48] + LDR r8, [sp, #16] + LDR r9, [sp, #12] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #12] + # Round 4 + LDR r5, [r0] + LDR r6, [r0, #4] + LDR r7, [r0, #8] + LDR r9, [r0, #12] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, 
r4 + ADD r9, r9, r6 + LDR r5, [sp, #16] + LDR r6, [r3, #16] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #16] + LDR r6, [r0, #20] + LDR r7, [r0, #24] + LDR r8, [r0, #28] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #28] + STR r9, [r0, #12] + # Calc new W[4] + LDR r6, [sp, #8] + LDR r7, [sp, #52] + LDR r8, [sp, #20] + LDR r9, [sp, #16] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #16] + # Round 5 + LDR r5, [r0, #28] + LDR r6, [r0] + LDR r7, [r0, #4] + LDR r9, [r0, #8] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #20] + LDR r6, [r3, #20] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #12] + LDR r6, [r0, #16] + LDR r7, [r0, #20] + LDR r8, [r0, #24] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #24] + STR r9, [r0, #8] + # Calc new W[5] + LDR r6, [sp, #12] + LDR r7, [sp, #56] + LDR r8, [sp, #24] + LDR r9, [sp, #20] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #20] + # Round 6 + LDR r5, [r0, #24] + LDR r6, [r0, #28] + LDR r7, [r0] + LDR r9, [r0, #4] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #24] + LDR r6, [r3, #24] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #8] + LDR r6, [r0, #12] + LDR r7, [r0, #16] + LDR r8, [r0, #20] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #20] + STR r9, [r0, #4] + # Calc new W[6] + LDR r6, [sp, #16] + LDR r7, [sp, #60] + LDR r8, [sp, #28] + LDR r9, [sp, #24] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #24] + # Round 7 + LDR r5, [r0, #20] + LDR r6, [r0, #24] + LDR r7, [r0, #28] + LDR r9, [r0] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #28] + LDR r6, [r3, #28] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #4] + LDR r6, [r0, #8] + LDR r7, [r0, #12] + LDR r8, [r0, #16] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #16] + STR r9, [r0] + # Calc new W[7] + LDR r6, [sp, #20] + LDR r7, [sp] + LDR r8, [sp, #32] + LDR r9, [sp, #28] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #28] + # Round 8 + LDR r5, [r0, #16] + LDR r6, [r0, #20] + LDR r7, [r0, #24] + LDR r9, [r0, #28] + ROR r4, r5, #6 + EOR 
r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #32] + LDR r6, [r3, #32] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0] + LDR r6, [r0, #4] + LDR r7, [r0, #8] + LDR r8, [r0, #12] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #12] + STR r9, [r0, #28] + # Calc new W[8] + LDR r6, [sp, #24] + LDR r7, [sp, #4] + LDR r8, [sp, #36] + LDR r9, [sp, #32] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #32] + # Round 9 + LDR r5, [r0, #12] + LDR r6, [r0, #16] + LDR r7, [r0, #20] + LDR r9, [r0, #24] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #36] + LDR r6, [r3, #36] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #28] + LDR r6, [r0] + LDR r7, [r0, #4] + LDR r8, [r0, #8] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #8] + STR r9, [r0, #24] + # Calc new W[9] + LDR r6, [sp, #28] + LDR r7, [sp, #8] + LDR r8, [sp, #40] + LDR r9, [sp, #36] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #36] + # Round 10 + LDR r5, [r0, #8] + LDR r6, [r0, #12] + LDR r7, [r0, #16] + LDR r9, [r0, #20] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #40] + LDR r6, [r3, #40] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #24] + LDR r6, [r0, #28] + LDR r7, [r0] + LDR r8, [r0, #4] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #4] + STR r9, [r0, #20] + # Calc new W[10] + LDR r6, [sp, #32] + LDR r7, [sp, #12] + LDR r8, [sp, #44] + LDR r9, [sp, #40] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #40] + # Round 11 + LDR r5, [r0, #4] + LDR r6, [r0, #8] + LDR r7, [r0, #12] + LDR r9, [r0, #16] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #44] + LDR r6, [r3, #44] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #20] + LDR r6, [r0, #24] + LDR r7, [r0, #28] + LDR r8, [r0] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0] + STR r9, [r0, #16] + # Calc new W[11] + LDR r6, [sp, #36] + LDR r7, [sp, #16] + LDR r8, [sp, #48] + LDR r9, [sp, #44] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #44] + 
# Round 12 + LDR r5, [r0] + LDR r6, [r0, #4] + LDR r7, [r0, #8] + LDR r9, [r0, #12] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #48] + LDR r6, [r3, #48] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #16] + LDR r6, [r0, #20] + LDR r7, [r0, #24] + LDR r8, [r0, #28] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #28] + STR r9, [r0, #12] + # Calc new W[12] + LDR r6, [sp, #40] + LDR r7, [sp, #20] + LDR r8, [sp, #52] + LDR r9, [sp, #48] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #48] + # Round 13 + LDR r5, [r0, #28] + LDR r6, [r0] + LDR r7, [r0, #4] + LDR r9, [r0, #8] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #52] + LDR r6, [r3, #52] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #12] + LDR r6, [r0, #16] + LDR r7, [r0, #20] + LDR r8, [r0, #24] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #24] + STR r9, [r0, #8] + # Calc new W[13] + LDR r6, [sp, #44] + LDR r7, [sp, #24] + LDR r8, [sp, #56] + LDR r9, [sp, #52] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #52] + # Round 14 + LDR r5, [r0, #24] + LDR r6, [r0, #28] + LDR r7, [r0] + LDR r9, [r0, #4] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #56] + LDR r6, [r3, #56] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #8] + LDR r6, [r0, #12] + LDR r7, [r0, #16] + LDR r8, [r0, #20] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #20] + STR r9, [r0, #4] + # Calc new W[14] + LDR r6, [sp, #48] + LDR r7, [sp, #28] + LDR r8, [sp, #60] + LDR r9, [sp, #56] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #56] + # Round 15 + LDR r5, [r0, #20] + LDR r6, [r0, #24] + LDR r7, [r0, #28] + LDR r9, [r0] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #60] + LDR r6, [r3, #60] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #4] + LDR r6, [r0, #8] + LDR r7, [r0, #12] + LDR r8, [r0, #16] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #16] + STR r9, [r0] + # Calc new W[15] + LDR r6, [sp, #52] + LDR r7, [sp, #32] + LDR r8, [sp] + LDR r9, [sp, #60] + ROR r4, r6, #17 + ROR r5, r8, #7 + EOR r4, r4, r6, ROR #19 + EOR r5, r5, r8, ROR #18 + EOR r4, 
r4, r6, LSR #10 + EOR r5, r5, r8, LSR #3 + ADD r9, r9, r7 + ADD r4, r4, r5 + ADD r9, r9, r4 + STR r9, [sp, #60] + ADD r3, r3, #0x40 + SUBS r12, r12, #0x1 + BNE L_SHA256_transform_len_start + # Round 0 + LDR r5, [r0, #16] + LDR r6, [r0, #20] + LDR r7, [r0, #24] + LDR r9, [r0, #28] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp] + LDR r6, [r3] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0] + LDR r6, [r0, #4] + LDR r7, [r0, #8] + LDR r8, [r0, #12] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #12] + STR r9, [r0, #28] + # Round 1 + LDR r5, [r0, #12] + LDR r6, [r0, #16] + LDR r7, [r0, #20] + LDR r9, [r0, #24] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #4] + LDR r6, [r3, #4] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #28] + LDR r6, [r0] + LDR r7, [r0, #4] + LDR r8, [r0, #8] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #8] + STR r9, [r0, #24] + # Round 2 + LDR r5, [r0, #8] + LDR r6, [r0, #12] + LDR r7, [r0, #16] + LDR r9, [r0, #20] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #8] + LDR r6, [r3, #8] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #24] + LDR r6, [r0, #28] + LDR r7, [r0] + LDR r8, [r0, #4] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #4] + STR r9, [r0, #20] + # Round 3 + LDR r5, [r0, #4] + LDR r6, [r0, #8] + LDR r7, [r0, #12] + LDR r9, [r0, #16] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #12] + LDR r6, [r3, #12] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #20] + LDR r6, [r0, #24] + LDR r7, [r0, #28] + LDR r8, [r0] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0] + STR r9, [r0, #16] + # Round 4 + LDR r5, [r0] + LDR r6, [r0, #4] + LDR r7, [r0, #8] + LDR r9, [r0, #12] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #16] + LDR r6, [r3, #16] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #16] + LDR r6, [r0, #20] + LDR r7, [r0, #24] + LDR r8, [r0, #28] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #28] + STR r9, [r0, #12] + # Round 5 + LDR r5, [r0, #28] + LDR r6, [r0] + LDR r7, [r0, #4] + LDR r9, [r0, #8] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #20] + LDR r6, [r3, #20] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, 
#12] + LDR r6, [r0, #16] + LDR r7, [r0, #20] + LDR r8, [r0, #24] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #24] + STR r9, [r0, #8] + # Round 6 + LDR r5, [r0, #24] + LDR r6, [r0, #28] + LDR r7, [r0] + LDR r9, [r0, #4] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #24] + LDR r6, [r3, #24] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #8] + LDR r6, [r0, #12] + LDR r7, [r0, #16] + LDR r8, [r0, #20] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #20] + STR r9, [r0, #4] + # Round 7 + LDR r5, [r0, #20] + LDR r6, [r0, #24] + LDR r7, [r0, #28] + LDR r9, [r0] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #28] + LDR r6, [r3, #28] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #4] + LDR r6, [r0, #8] + LDR r7, [r0, #12] + LDR r8, [r0, #16] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #16] + STR r9, [r0] + # Round 8 + LDR r5, [r0, #16] + LDR r6, [r0, #20] + LDR r7, [r0, #24] + LDR r9, [r0, #28] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #32] + LDR r6, [r3, #32] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0] + LDR r6, [r0, #4] + LDR r7, [r0, #8] + LDR r8, [r0, #12] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #12] + STR r9, [r0, #28] + # Round 9 + LDR r5, [r0, #12] + LDR r6, [r0, #16] + LDR r7, [r0, #20] + LDR r9, [r0, #24] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #36] + LDR r6, [r3, #36] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #28] + LDR r6, [r0] + LDR r7, [r0, #4] + LDR r8, [r0, #8] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #8] + STR r9, [r0, #24] + # Round 10 + LDR r5, [r0, #8] + LDR r6, [r0, #12] + LDR r7, [r0, #16] + LDR r9, [r0, #20] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #40] + LDR r6, [r3, #40] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #24] + LDR r6, [r0, #28] + LDR r7, [r0] + LDR r8, [r0, #4] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #4] + STR r9, [r0, #20] + # Round 11 + LDR r5, [r0, #4] + LDR r6, [r0, #8] + LDR r7, [r0, #12] + LDR r9, [r0, #16] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, 
r4 + ADD r9, r9, r6 + LDR r5, [sp, #44] + LDR r6, [r3, #44] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #20] + LDR r6, [r0, #24] + LDR r7, [r0, #28] + LDR r8, [r0] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0] + STR r9, [r0, #16] + # Round 12 + LDR r5, [r0] + LDR r6, [r0, #4] + LDR r7, [r0, #8] + LDR r9, [r0, #12] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #48] + LDR r6, [r3, #48] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #16] + LDR r6, [r0, #20] + LDR r7, [r0, #24] + LDR r8, [r0, #28] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #28] + STR r9, [r0, #12] + # Round 13 + LDR r5, [r0, #28] + LDR r6, [r0] + LDR r7, [r0, #4] + LDR r9, [r0, #8] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #52] + LDR r6, [r3, #52] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #12] + LDR r6, [r0, #16] + LDR r7, [r0, #20] + LDR r8, [r0, #24] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #24] + STR r9, [r0, #8] + # Round 14 + LDR r5, [r0, #24] + LDR r6, [r0, #28] + LDR r7, [r0] + LDR r9, [r0, #4] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #56] + LDR r6, [r3, #56] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #8] + LDR r6, [r0, #12] + LDR r7, [r0, #16] + LDR r8, [r0, #20] + ROR r4, r5, #2 + EOR r10, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r11, r11, r10 + EOR r4, r4, r5, ROR #22 + EOR r11, r11, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r11 + STR r8, [r0, #20] + STR r9, [r0, #4] + # Round 15 + LDR r5, [r0, #20] + LDR r6, [r0, #24] + LDR r7, [r0, #28] + LDR r9, [r0] + ROR r4, r5, #6 + EOR r6, r6, r7 + EOR r4, r4, r5, ROR #11 + AND r6, r6, r5 + EOR r4, r4, r5, ROR #25 + EOR r6, r6, r7 + ADD r9, r9, r4 + ADD r9, r9, r6 + LDR r5, [sp, #60] + LDR r6, [r3, #60] + ADD r9, r9, r5 + ADD r9, r9, r6 + LDR r5, [r0, #4] + LDR r6, [r0, #8] + LDR r7, [r0, #12] + LDR r8, [r0, #16] + ROR r4, r5, #2 + EOR r11, r5, r6 + EOR r4, r4, r5, ROR #13 + AND r10, r10, r11 + EOR r4, r4, r5, ROR #22 + EOR r10, r10, r6 + ADD r8, r8, r9 + ADD r9, r9, r4 + ADD r9, r9, r10 + STR r8, [r0, #16] + STR r9, [r0] + # Add in digest from start + LDRD r4, r5, [r0] + LDRD r6, r7, [r0, #8] + LDRD r8, r9, [sp, #64] + LDRD r10, r11, [sp, #72] + ADD r4, r4, r8 + ADD r5, r5, r9 + ADD r6, r6, r10 + ADD r7, r7, r11 + STRD r4, r5, [r0] + STRD r6, r7, [r0, #8] + STRD r4, r5, [sp, #64] + STRD r6, r7, [sp, #72] + LDRD r4, r5, [r0, #16] + LDRD r6, r7, [r0, #24] + LDRD r8, r9, [sp, #80] + LDRD r10, r11, [sp, #88] + ADD r4, r4, r8 + ADD r5, r5, r9 + ADD r6, r6, r10 + ADD r7, r7, r11 + STRD r4, r5, [r0, #16] + STRD r6, r7, [r0, #24] + STRD r4, r5, [sp, #80] + STRD r6, r7, [sp, #88] + SUBS r2, r2, #0x40 + SUB r3, r3, #0xc0 + ADD r1, r1, #0x40 + BNE L_SHA256_transform_len_begin + ADD sp, sp, #0xc0 + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle 
Count = 1866 + .size Transform_Sha256_Len,.-Transform_Sha256_Len +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#endif /* !NO_SHA256 */ +#endif /* !__aarch64__ && __thumb__ */ +#endif /* WOLFSSL_ARMASM */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif +#endif /* !WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c b/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c new file mode 100644 index 0000000000..3eb6ec355c --- /dev/null +++ b/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c @@ -0,0 +1,1476 @@ +/* thumb2-sha256-asm + * + * Copyright (C) 2006-2023 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha2/sha256.rb thumb2 ../wolfssl/wolfcrypt/src/port/arm/thumb2-sha256-asm.c + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__thumb__) +#include +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include +#ifdef WOLFSSL_ARMASM_INLINE +#ifndef NO_SHA256 +#include + +#ifdef WOLFSSL_ARMASM_NO_NEON +static const uint32_t L_SHA256_transform_len_k[] = { + 0x428a2f98, + 0x71374491, + 0xb5c0fbcf, + 0xe9b5dba5, + 0x3956c25b, + 0x59f111f1, + 0x923f82a4, + 0xab1c5ed5, + 0xd807aa98, + 0x12835b01, + 0x243185be, + 0x550c7dc3, + 0x72be5d74, + 0x80deb1fe, + 0x9bdc06a7, + 0xc19bf174, + 0xe49b69c1, + 0xefbe4786, + 0xfc19dc6, + 0x240ca1cc, + 0x2de92c6f, + 0x4a7484aa, + 0x5cb0a9dc, + 0x76f988da, + 0x983e5152, + 0xa831c66d, + 0xb00327c8, + 0xbf597fc7, + 0xc6e00bf3, + 0xd5a79147, + 0x6ca6351, + 0x14292967, + 0x27b70a85, + 0x2e1b2138, + 0x4d2c6dfc, + 0x53380d13, + 0x650a7354, + 0x766a0abb, + 0x81c2c92e, + 0x92722c85, + 0xa2bfe8a1, + 0xa81a664b, + 0xc24b8b70, + 0xc76c51a3, + 0xd192e819, + 0xd6990624, + 0xf40e3585, + 0x106aa070, + 0x19a4c116, + 0x1e376c08, + 0x2748774c, + 0x34b0bcb5, + 0x391c0cb3, + 0x4ed8aa4a, + 0x5b9cca4f, + 0x682e6ff3, + 0x748f82ee, + 0x78a5636f, + 0x84c87814, + 0x8cc70208, + 0x90befffa, + 0xa4506ceb, + 0xbef9a3f7, + 0xc67178f2, +}; + +void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len); +void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) +{ + __asm__ __volatile__ ( + "SUB sp, sp, #0xc0\n\t" + "MOV r3, %[L_SHA256_transform_len_k]\n\t" + /* Copy digest to add in at end */ + "LDRD r4, r5, [%[sha256]]\n\t" + "LDRD r6, r7, [%[sha256], #8]\n\t" + "LDRD r8, r9, [%[sha256], #16]\n\t" + "LDRD r10, r11, [%[sha256], #24]\n\t" + "STRD r4, r5, [sp, #64]\n\t" + "STRD r6, r7, [sp, #72]\n\t" + "STRD r8, r9, [sp, #80]\n\t" + "STRD r10, r11, [sp, #88]\n\t" + /* Start of loop processing a block */ + "\n" + "L_SHA256_transform_len_begin_%=:\n\t" + /* Load, Reverse and Store W - 64 bytes */ + "LDRD r4, r5, [%[data]]\n\t" + "LDRD 
r6, r7, [%[data], #8]\n\t" + "LDRD r8, r9, [%[data], #16]\n\t" + "LDRD r10, r11, [%[data], #24]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "REV r8, r8\n\t" + "REV r9, r9\n\t" + "REV r10, r10\n\t" + "REV r11, r11\n\t" + "STRD r4, r5, [sp]\n\t" + "STRD r6, r7, [sp, #8]\n\t" + "STRD r8, r9, [sp, #16]\n\t" + "STRD r10, r11, [sp, #24]\n\t" + "LDRD r4, r5, [%[data], #32]\n\t" + "LDRD r6, r7, [%[data], #40]\n\t" + "LDRD r8, r9, [%[data], #48]\n\t" + "LDRD r10, r11, [%[data], #56]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "REV r8, r8\n\t" + "REV r9, r9\n\t" + "REV r10, r10\n\t" + "REV r11, r11\n\t" + "STRD r4, r5, [sp, #32]\n\t" + "STRD r6, r7, [sp, #40]\n\t" + "STRD r8, r9, [sp, #48]\n\t" + "STRD r10, r11, [sp, #56]\n\t" + "LDR r11, [%[sha256], #4]\n\t" + "LDR r4, [%[sha256], #8]\n\t" + "EOR r11, r11, r4\n\t" + "MOV r12, #0x3\n\t" + /* Start of 16 rounds */ + "\n" + "L_SHA256_transform_len_start_%=:\n\t" + /* Round 0 */ + "LDR r5, [%[sha256], #16]\n\t" + "LDR r6, [%[sha256], #20]\n\t" + "LDR r7, [%[sha256], #24]\n\t" + "LDR r9, [%[sha256], #28]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp]\n\t" + "LDR r6, [r3]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256]]\n\t" + "LDR r6, [%[sha256], #4]\n\t" + "LDR r7, [%[sha256], #8]\n\t" + "LDR r8, [%[sha256], #12]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #12]\n\t" + "STR r9, [%[sha256], #28]\n\t" + /* Calc new W[0] */ + "LDR r6, [sp, #56]\n\t" + "LDR r7, [sp, #36]\n\t" + "LDR r8, [sp, #4]\n\t" + "LDR r9, [sp]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp]\n\t" + /* Round 1 */ + "LDR r5, [%[sha256], #12]\n\t" + "LDR r6, [%[sha256], #16]\n\t" + "LDR r7, [%[sha256], #20]\n\t" + "LDR r9, [%[sha256], #24]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #4]\n\t" + "LDR r6, [r3, #4]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #28]\n\t" + "LDR r6, [%[sha256]]\n\t" + "LDR r7, [%[sha256], #4]\n\t" + "LDR r8, [%[sha256], #8]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #8]\n\t" + "STR r9, [%[sha256], #24]\n\t" + /* Calc new W[1] */ + "LDR r6, [sp, #60]\n\t" + "LDR r7, [sp, #40]\n\t" + "LDR r8, [sp, #8]\n\t" + "LDR r9, [sp, #4]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #4]\n\t" + /* Round 2 */ + "LDR r5, [%[sha256], #8]\n\t" + "LDR 
r6, [%[sha256], #12]\n\t" + "LDR r7, [%[sha256], #16]\n\t" + "LDR r9, [%[sha256], #20]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #8]\n\t" + "LDR r6, [r3, #8]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #24]\n\t" + "LDR r6, [%[sha256], #28]\n\t" + "LDR r7, [%[sha256]]\n\t" + "LDR r8, [%[sha256], #4]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #4]\n\t" + "STR r9, [%[sha256], #20]\n\t" + /* Calc new W[2] */ + "LDR r6, [sp]\n\t" + "LDR r7, [sp, #44]\n\t" + "LDR r8, [sp, #12]\n\t" + "LDR r9, [sp, #8]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #8]\n\t" + /* Round 3 */ + "LDR r5, [%[sha256], #4]\n\t" + "LDR r6, [%[sha256], #8]\n\t" + "LDR r7, [%[sha256], #12]\n\t" + "LDR r9, [%[sha256], #16]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #12]\n\t" + "LDR r6, [r3, #12]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #20]\n\t" + "LDR r6, [%[sha256], #24]\n\t" + "LDR r7, [%[sha256], #28]\n\t" + "LDR r8, [%[sha256]]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256]]\n\t" + "STR r9, [%[sha256], #16]\n\t" + /* Calc new W[3] */ + "LDR r6, [sp, #4]\n\t" + "LDR r7, [sp, #48]\n\t" + "LDR r8, [sp, #16]\n\t" + "LDR r9, [sp, #12]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #12]\n\t" + /* Round 4 */ + "LDR r5, [%[sha256]]\n\t" + "LDR r6, [%[sha256], #4]\n\t" + "LDR r7, [%[sha256], #8]\n\t" + "LDR r9, [%[sha256], #12]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #16]\n\t" + "LDR r6, [r3, #16]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #16]\n\t" + "LDR r6, [%[sha256], #20]\n\t" + "LDR r7, [%[sha256], #24]\n\t" + "LDR r8, [%[sha256], #28]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #28]\n\t" + "STR r9, [%[sha256], #12]\n\t" + /* Calc new W[4] */ + "LDR r6, [sp, #8]\n\t" + "LDR r7, [sp, #52]\n\t" + "LDR r8, [sp, #20]\n\t" + "LDR r9, [sp, #16]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, 
ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #16]\n\t" + /* Round 5 */ + "LDR r5, [%[sha256], #28]\n\t" + "LDR r6, [%[sha256]]\n\t" + "LDR r7, [%[sha256], #4]\n\t" + "LDR r9, [%[sha256], #8]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #20]\n\t" + "LDR r6, [r3, #20]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #12]\n\t" + "LDR r6, [%[sha256], #16]\n\t" + "LDR r7, [%[sha256], #20]\n\t" + "LDR r8, [%[sha256], #24]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #24]\n\t" + "STR r9, [%[sha256], #8]\n\t" + /* Calc new W[5] */ + "LDR r6, [sp, #12]\n\t" + "LDR r7, [sp, #56]\n\t" + "LDR r8, [sp, #24]\n\t" + "LDR r9, [sp, #20]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #20]\n\t" + /* Round 6 */ + "LDR r5, [%[sha256], #24]\n\t" + "LDR r6, [%[sha256], #28]\n\t" + "LDR r7, [%[sha256]]\n\t" + "LDR r9, [%[sha256], #4]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #24]\n\t" + "LDR r6, [r3, #24]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #8]\n\t" + "LDR r6, [%[sha256], #12]\n\t" + "LDR r7, [%[sha256], #16]\n\t" + "LDR r8, [%[sha256], #20]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #20]\n\t" + "STR r9, [%[sha256], #4]\n\t" + /* Calc new W[6] */ + "LDR r6, [sp, #16]\n\t" + "LDR r7, [sp, #60]\n\t" + "LDR r8, [sp, #28]\n\t" + "LDR r9, [sp, #24]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #24]\n\t" + /* Round 7 */ + "LDR r5, [%[sha256], #20]\n\t" + "LDR r6, [%[sha256], #24]\n\t" + "LDR r7, [%[sha256], #28]\n\t" + "LDR r9, [%[sha256]]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #28]\n\t" + "LDR r6, [r3, #28]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #4]\n\t" + "LDR r6, [%[sha256], #8]\n\t" + "LDR r7, [%[sha256], #12]\n\t" + "LDR r8, [%[sha256], #16]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #16]\n\t" + "STR r9, 
[%[sha256]]\n\t" + /* Calc new W[7] */ + "LDR r6, [sp, #20]\n\t" + "LDR r7, [sp]\n\t" + "LDR r8, [sp, #32]\n\t" + "LDR r9, [sp, #28]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #28]\n\t" + /* Round 8 */ + "LDR r5, [%[sha256], #16]\n\t" + "LDR r6, [%[sha256], #20]\n\t" + "LDR r7, [%[sha256], #24]\n\t" + "LDR r9, [%[sha256], #28]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #32]\n\t" + "LDR r6, [r3, #32]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256]]\n\t" + "LDR r6, [%[sha256], #4]\n\t" + "LDR r7, [%[sha256], #8]\n\t" + "LDR r8, [%[sha256], #12]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #12]\n\t" + "STR r9, [%[sha256], #28]\n\t" + /* Calc new W[8] */ + "LDR r6, [sp, #24]\n\t" + "LDR r7, [sp, #4]\n\t" + "LDR r8, [sp, #36]\n\t" + "LDR r9, [sp, #32]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #32]\n\t" + /* Round 9 */ + "LDR r5, [%[sha256], #12]\n\t" + "LDR r6, [%[sha256], #16]\n\t" + "LDR r7, [%[sha256], #20]\n\t" + "LDR r9, [%[sha256], #24]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #36]\n\t" + "LDR r6, [r3, #36]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #28]\n\t" + "LDR r6, [%[sha256]]\n\t" + "LDR r7, [%[sha256], #4]\n\t" + "LDR r8, [%[sha256], #8]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #8]\n\t" + "STR r9, [%[sha256], #24]\n\t" + /* Calc new W[9] */ + "LDR r6, [sp, #28]\n\t" + "LDR r7, [sp, #8]\n\t" + "LDR r8, [sp, #40]\n\t" + "LDR r9, [sp, #36]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #36]\n\t" + /* Round 10 */ + "LDR r5, [%[sha256], #8]\n\t" + "LDR r6, [%[sha256], #12]\n\t" + "LDR r7, [%[sha256], #16]\n\t" + "LDR r9, [%[sha256], #20]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #40]\n\t" + "LDR r6, [r3, #40]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #24]\n\t" + "LDR r6, [%[sha256], #28]\n\t" + "LDR r7, [%[sha256]]\n\t" + "LDR r8, [%[sha256], #4]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, 
r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #4]\n\t" + "STR r9, [%[sha256], #20]\n\t" + /* Calc new W[10] */ + "LDR r6, [sp, #32]\n\t" + "LDR r7, [sp, #12]\n\t" + "LDR r8, [sp, #44]\n\t" + "LDR r9, [sp, #40]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #40]\n\t" + /* Round 11 */ + "LDR r5, [%[sha256], #4]\n\t" + "LDR r6, [%[sha256], #8]\n\t" + "LDR r7, [%[sha256], #12]\n\t" + "LDR r9, [%[sha256], #16]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #44]\n\t" + "LDR r6, [r3, #44]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #20]\n\t" + "LDR r6, [%[sha256], #24]\n\t" + "LDR r7, [%[sha256], #28]\n\t" + "LDR r8, [%[sha256]]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256]]\n\t" + "STR r9, [%[sha256], #16]\n\t" + /* Calc new W[11] */ + "LDR r6, [sp, #36]\n\t" + "LDR r7, [sp, #16]\n\t" + "LDR r8, [sp, #48]\n\t" + "LDR r9, [sp, #44]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #44]\n\t" + /* Round 12 */ + "LDR r5, [%[sha256]]\n\t" + "LDR r6, [%[sha256], #4]\n\t" + "LDR r7, [%[sha256], #8]\n\t" + "LDR r9, [%[sha256], #12]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #48]\n\t" + "LDR r6, [r3, #48]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #16]\n\t" + "LDR r6, [%[sha256], #20]\n\t" + "LDR r7, [%[sha256], #24]\n\t" + "LDR r8, [%[sha256], #28]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #28]\n\t" + "STR r9, [%[sha256], #12]\n\t" + /* Calc new W[12] */ + "LDR r6, [sp, #40]\n\t" + "LDR r7, [sp, #20]\n\t" + "LDR r8, [sp, #52]\n\t" + "LDR r9, [sp, #48]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #48]\n\t" + /* Round 13 */ + "LDR r5, [%[sha256], #28]\n\t" + "LDR r6, [%[sha256]]\n\t" + "LDR r7, [%[sha256], #4]\n\t" + "LDR r9, [%[sha256], #8]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #52]\n\t" 
+ "LDR r6, [r3, #52]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #12]\n\t" + "LDR r6, [%[sha256], #16]\n\t" + "LDR r7, [%[sha256], #20]\n\t" + "LDR r8, [%[sha256], #24]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #24]\n\t" + "STR r9, [%[sha256], #8]\n\t" + /* Calc new W[13] */ + "LDR r6, [sp, #44]\n\t" + "LDR r7, [sp, #24]\n\t" + "LDR r8, [sp, #56]\n\t" + "LDR r9, [sp, #52]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #52]\n\t" + /* Round 14 */ + "LDR r5, [%[sha256], #24]\n\t" + "LDR r6, [%[sha256], #28]\n\t" + "LDR r7, [%[sha256]]\n\t" + "LDR r9, [%[sha256], #4]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #56]\n\t" + "LDR r6, [r3, #56]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #8]\n\t" + "LDR r6, [%[sha256], #12]\n\t" + "LDR r7, [%[sha256], #16]\n\t" + "LDR r8, [%[sha256], #20]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #20]\n\t" + "STR r9, [%[sha256], #4]\n\t" + /* Calc new W[14] */ + "LDR r6, [sp, #48]\n\t" + "LDR r7, [sp, #28]\n\t" + "LDR r8, [sp, #60]\n\t" + "LDR r9, [sp, #56]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #56]\n\t" + /* Round 15 */ + "LDR r5, [%[sha256], #20]\n\t" + "LDR r6, [%[sha256], #24]\n\t" + "LDR r7, [%[sha256], #28]\n\t" + "LDR r9, [%[sha256]]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #60]\n\t" + "LDR r6, [r3, #60]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #4]\n\t" + "LDR r6, [%[sha256], #8]\n\t" + "LDR r7, [%[sha256], #12]\n\t" + "LDR r8, [%[sha256], #16]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #16]\n\t" + "STR r9, [%[sha256]]\n\t" + /* Calc new W[15] */ + "LDR r6, [sp, #52]\n\t" + "LDR r7, [sp, #32]\n\t" + "LDR r8, [sp]\n\t" + "LDR r9, [sp, #60]\n\t" + "ROR r4, r6, #17\n\t" + "ROR r5, r8, #7\n\t" + "EOR r4, r4, r6, ROR #19\n\t" + "EOR r5, r5, r8, ROR #18\n\t" + "EOR r4, r4, r6, LSR #10\n\t" + "EOR r5, r5, r8, LSR #3\n\t" + "ADD r9, r9, r7\n\t" + "ADD r4, r4, r5\n\t" + "ADD r9, r9, r4\n\t" + "STR r9, [sp, #60]\n\t" + "ADD r3, r3, #0x40\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_SHA256_transform_len_start_%=\n\t" + /* Round 0 */ + "LDR r5, 
[%[sha256], #16]\n\t" + "LDR r6, [%[sha256], #20]\n\t" + "LDR r7, [%[sha256], #24]\n\t" + "LDR r9, [%[sha256], #28]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp]\n\t" + "LDR r6, [r3]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256]]\n\t" + "LDR r6, [%[sha256], #4]\n\t" + "LDR r7, [%[sha256], #8]\n\t" + "LDR r8, [%[sha256], #12]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #12]\n\t" + "STR r9, [%[sha256], #28]\n\t" + /* Round 1 */ + "LDR r5, [%[sha256], #12]\n\t" + "LDR r6, [%[sha256], #16]\n\t" + "LDR r7, [%[sha256], #20]\n\t" + "LDR r9, [%[sha256], #24]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #4]\n\t" + "LDR r6, [r3, #4]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #28]\n\t" + "LDR r6, [%[sha256]]\n\t" + "LDR r7, [%[sha256], #4]\n\t" + "LDR r8, [%[sha256], #8]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #8]\n\t" + "STR r9, [%[sha256], #24]\n\t" + /* Round 2 */ + "LDR r5, [%[sha256], #8]\n\t" + "LDR r6, [%[sha256], #12]\n\t" + "LDR r7, [%[sha256], #16]\n\t" + "LDR r9, [%[sha256], #20]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #8]\n\t" + "LDR r6, [r3, #8]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #24]\n\t" + "LDR r6, [%[sha256], #28]\n\t" + "LDR r7, [%[sha256]]\n\t" + "LDR r8, [%[sha256], #4]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #4]\n\t" + "STR r9, [%[sha256], #20]\n\t" + /* Round 3 */ + "LDR r5, [%[sha256], #4]\n\t" + "LDR r6, [%[sha256], #8]\n\t" + "LDR r7, [%[sha256], #12]\n\t" + "LDR r9, [%[sha256], #16]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #12]\n\t" + "LDR r6, [r3, #12]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #20]\n\t" + "LDR r6, [%[sha256], #24]\n\t" + "LDR r7, [%[sha256], #28]\n\t" + "LDR r8, [%[sha256]]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256]]\n\t" + "STR r9, [%[sha256], #16]\n\t" + /* Round 4 */ + "LDR r5, [%[sha256]]\n\t" + "LDR r6, [%[sha256], #4]\n\t" + "LDR r7, [%[sha256], #8]\n\t" + "LDR r9, 
[%[sha256], #12]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #16]\n\t" + "LDR r6, [r3, #16]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #16]\n\t" + "LDR r6, [%[sha256], #20]\n\t" + "LDR r7, [%[sha256], #24]\n\t" + "LDR r8, [%[sha256], #28]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #28]\n\t" + "STR r9, [%[sha256], #12]\n\t" + /* Round 5 */ + "LDR r5, [%[sha256], #28]\n\t" + "LDR r6, [%[sha256]]\n\t" + "LDR r7, [%[sha256], #4]\n\t" + "LDR r9, [%[sha256], #8]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #20]\n\t" + "LDR r6, [r3, #20]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #12]\n\t" + "LDR r6, [%[sha256], #16]\n\t" + "LDR r7, [%[sha256], #20]\n\t" + "LDR r8, [%[sha256], #24]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #24]\n\t" + "STR r9, [%[sha256], #8]\n\t" + /* Round 6 */ + "LDR r5, [%[sha256], #24]\n\t" + "LDR r6, [%[sha256], #28]\n\t" + "LDR r7, [%[sha256]]\n\t" + "LDR r9, [%[sha256], #4]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #24]\n\t" + "LDR r6, [r3, #24]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #8]\n\t" + "LDR r6, [%[sha256], #12]\n\t" + "LDR r7, [%[sha256], #16]\n\t" + "LDR r8, [%[sha256], #20]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #20]\n\t" + "STR r9, [%[sha256], #4]\n\t" + /* Round 7 */ + "LDR r5, [%[sha256], #20]\n\t" + "LDR r6, [%[sha256], #24]\n\t" + "LDR r7, [%[sha256], #28]\n\t" + "LDR r9, [%[sha256]]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #28]\n\t" + "LDR r6, [r3, #28]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #4]\n\t" + "LDR r6, [%[sha256], #8]\n\t" + "LDR r7, [%[sha256], #12]\n\t" + "LDR r8, [%[sha256], #16]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #16]\n\t" + "STR r9, [%[sha256]]\n\t" + /* Round 8 */ + "LDR r5, [%[sha256], #16]\n\t" + "LDR r6, [%[sha256], #20]\n\t" + "LDR r7, [%[sha256], #24]\n\t" + "LDR r9, [%[sha256], #28]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + 
"EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #32]\n\t" + "LDR r6, [r3, #32]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256]]\n\t" + "LDR r6, [%[sha256], #4]\n\t" + "LDR r7, [%[sha256], #8]\n\t" + "LDR r8, [%[sha256], #12]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #12]\n\t" + "STR r9, [%[sha256], #28]\n\t" + /* Round 9 */ + "LDR r5, [%[sha256], #12]\n\t" + "LDR r6, [%[sha256], #16]\n\t" + "LDR r7, [%[sha256], #20]\n\t" + "LDR r9, [%[sha256], #24]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #36]\n\t" + "LDR r6, [r3, #36]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #28]\n\t" + "LDR r6, [%[sha256]]\n\t" + "LDR r7, [%[sha256], #4]\n\t" + "LDR r8, [%[sha256], #8]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #8]\n\t" + "STR r9, [%[sha256], #24]\n\t" + /* Round 10 */ + "LDR r5, [%[sha256], #8]\n\t" + "LDR r6, [%[sha256], #12]\n\t" + "LDR r7, [%[sha256], #16]\n\t" + "LDR r9, [%[sha256], #20]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #40]\n\t" + "LDR r6, [r3, #40]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #24]\n\t" + "LDR r6, [%[sha256], #28]\n\t" + "LDR r7, [%[sha256]]\n\t" + "LDR r8, [%[sha256], #4]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #4]\n\t" + "STR r9, [%[sha256], #20]\n\t" + /* Round 11 */ + "LDR r5, [%[sha256], #4]\n\t" + "LDR r6, [%[sha256], #8]\n\t" + "LDR r7, [%[sha256], #12]\n\t" + "LDR r9, [%[sha256], #16]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #44]\n\t" + "LDR r6, [r3, #44]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #20]\n\t" + "LDR r6, [%[sha256], #24]\n\t" + "LDR r7, [%[sha256], #28]\n\t" + "LDR r8, [%[sha256]]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256]]\n\t" + "STR r9, [%[sha256], #16]\n\t" + /* Round 12 */ + "LDR r5, [%[sha256]]\n\t" + "LDR r6, [%[sha256], #4]\n\t" + "LDR r7, [%[sha256], #8]\n\t" + "LDR r9, [%[sha256], #12]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR 
#25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #48]\n\t" + "LDR r6, [r3, #48]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #16]\n\t" + "LDR r6, [%[sha256], #20]\n\t" + "LDR r7, [%[sha256], #24]\n\t" + "LDR r8, [%[sha256], #28]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #28]\n\t" + "STR r9, [%[sha256], #12]\n\t" + /* Round 13 */ + "LDR r5, [%[sha256], #28]\n\t" + "LDR r6, [%[sha256]]\n\t" + "LDR r7, [%[sha256], #4]\n\t" + "LDR r9, [%[sha256], #8]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #52]\n\t" + "LDR r6, [r3, #52]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #12]\n\t" + "LDR r6, [%[sha256], #16]\n\t" + "LDR r7, [%[sha256], #20]\n\t" + "LDR r8, [%[sha256], #24]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #24]\n\t" + "STR r9, [%[sha256], #8]\n\t" + /* Round 14 */ + "LDR r5, [%[sha256], #24]\n\t" + "LDR r6, [%[sha256], #28]\n\t" + "LDR r7, [%[sha256]]\n\t" + "LDR r9, [%[sha256], #4]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #56]\n\t" + "LDR r6, [r3, #56]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #8]\n\t" + "LDR r6, [%[sha256], #12]\n\t" + "LDR r7, [%[sha256], #16]\n\t" + "LDR r8, [%[sha256], #20]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r10, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r11, r11, r10\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r11, r11, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r11\n\t" + "STR r8, [%[sha256], #20]\n\t" + "STR r9, [%[sha256], #4]\n\t" + /* Round 15 */ + "LDR r5, [%[sha256], #20]\n\t" + "LDR r6, [%[sha256], #24]\n\t" + "LDR r7, [%[sha256], #28]\n\t" + "LDR r9, [%[sha256]]\n\t" + "ROR r4, r5, #6\n\t" + "EOR r6, r6, r7\n\t" + "EOR r4, r4, r5, ROR #11\n\t" + "AND r6, r6, r5\n\t" + "EOR r4, r4, r5, ROR #25\n\t" + "EOR r6, r6, r7\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [sp, #60]\n\t" + "LDR r6, [r3, #60]\n\t" + "ADD r9, r9, r5\n\t" + "ADD r9, r9, r6\n\t" + "LDR r5, [%[sha256], #4]\n\t" + "LDR r6, [%[sha256], #8]\n\t" + "LDR r7, [%[sha256], #12]\n\t" + "LDR r8, [%[sha256], #16]\n\t" + "ROR r4, r5, #2\n\t" + "EOR r11, r5, r6\n\t" + "EOR r4, r4, r5, ROR #13\n\t" + "AND r10, r10, r11\n\t" + "EOR r4, r4, r5, ROR #22\n\t" + "EOR r10, r10, r6\n\t" + "ADD r8, r8, r9\n\t" + "ADD r9, r9, r4\n\t" + "ADD r9, r9, r10\n\t" + "STR r8, [%[sha256], #16]\n\t" + "STR r9, [%[sha256]]\n\t" + /* Add in digest from start */ + "LDRD r4, r5, [%[sha256]]\n\t" + "LDRD r6, r7, [%[sha256], #8]\n\t" + "LDRD r8, r9, [sp, #64]\n\t" + "LDRD r10, r11, [sp, #72]\n\t" + "ADD r4, r4, r8\n\t" + "ADD r5, r5, r9\n\t" + "ADD r6, r6, r10\n\t" + "ADD r7, r7, r11\n\t" + "STRD r4, r5, [%[sha256]]\n\t" + "STRD r6, r7, [%[sha256], #8]\n\t" + "STRD 
r4, r5, [sp, #64]\n\t" + "STRD r6, r7, [sp, #72]\n\t" + "LDRD r4, r5, [%[sha256], #16]\n\t" + "LDRD r6, r7, [%[sha256], #24]\n\t" + "LDRD r8, r9, [sp, #80]\n\t" + "LDRD r10, r11, [sp, #88]\n\t" + "ADD r4, r4, r8\n\t" + "ADD r5, r5, r9\n\t" + "ADD r6, r6, r10\n\t" + "ADD r7, r7, r11\n\t" + "STRD r4, r5, [%[sha256], #16]\n\t" + "STRD r6, r7, [%[sha256], #24]\n\t" + "STRD r4, r5, [sp, #80]\n\t" + "STRD r6, r7, [sp, #88]\n\t" + "SUBS %[len], %[len], #0x40\n\t" + "SUB r3, r3, #0xc0\n\t" + "ADD %[data], %[data], #0x40\n\t" + "BNE L_SHA256_transform_len_begin_%=\n\t" + "ADD sp, sp, #0xc0\n\t" + : [sha256] "+l" (sha256), [data] "+l" (data), [len] "+l" (len) + : [L_SHA256_transform_len_k] "r" (L_SHA256_transform_len_k) + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" + ); +} + +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#endif /* !NO_SHA256 */ +#endif /* !__aarch64__ && __thumb__ */ +#endif /* WOLFSSL_ARMASM */ +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-sha512-asm.S b/wolfcrypt/src/port/arm/thumb2-sha512-asm.S new file mode 100644 index 0000000000..34912f4314 --- /dev/null +++ b/wolfcrypt/src/port/arm/thumb2-sha512-asm.S @@ -0,0 +1,3669 @@ +/* thumb2-sha512-asm + * + * Copyright (C) 2006-2023 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha2/sha512.rb thumb2 ../wolfssl/wolfcrypt/src/port/arm/thumb2-sha512-asm.S + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__thumb__) +#ifndef WOLFSSL_ARMASM_INLINE + .thumb + .syntax unified +#ifdef WOLFSSL_SHA512 +#ifdef WOLFSSL_ARMASM_NO_NEON + .text + .type L_SHA512_transform_len_k, %object + .size L_SHA512_transform_len_k, 640 + .align 4 +L_SHA512_transform_len_k: + .word 0xd728ae22 + .word 0x428a2f98 + .word 0x23ef65cd + .word 0x71374491 + .word 0xec4d3b2f + .word 0xb5c0fbcf + .word 0x8189dbbc + .word 0xe9b5dba5 + .word 0xf348b538 + .word 0x3956c25b + .word 0xb605d019 + .word 0x59f111f1 + .word 0xaf194f9b + .word 0x923f82a4 + .word 0xda6d8118 + .word 0xab1c5ed5 + .word 0xa3030242 + .word 0xd807aa98 + .word 0x45706fbe + .word 0x12835b01 + .word 0x4ee4b28c + .word 0x243185be + .word 0xd5ffb4e2 + .word 0x550c7dc3 + .word 0xf27b896f + .word 0x72be5d74 + .word 0x3b1696b1 + .word 0x80deb1fe + .word 0x25c71235 + .word 0x9bdc06a7 + .word 0xcf692694 + .word 0xc19bf174 + .word 0x9ef14ad2 + .word 0xe49b69c1 + .word 0x384f25e3 + .word 0xefbe4786 + .word 0x8b8cd5b5 + .word 0xfc19dc6 + .word 0x77ac9c65 + .word 0x240ca1cc + .word 0x592b0275 + .word 0x2de92c6f + .word 0x6ea6e483 + .word 0x4a7484aa + .word 0xbd41fbd4 + .word 0x5cb0a9dc + .word 0x831153b5 + .word 0x76f988da + .word 0xee66dfab + .word 0x983e5152 + .word 0x2db43210 + .word 0xa831c66d + .word 0x98fb213f + .word 0xb00327c8 + .word 0xbeef0ee4 + .word 0xbf597fc7 + .word 0x3da88fc2 + .word 0xc6e00bf3 + .word 0x930aa725 + .word 0xd5a79147 + .word 0xe003826f + .word 0x6ca6351 + .word 0xa0e6e70 + .word 0x14292967 + .word 0x46d22ffc + .word 0x27b70a85 + .word 0x5c26c926 + .word 0x2e1b2138 + .word 0x5ac42aed + .word 0x4d2c6dfc + .word 0x9d95b3df + .word 0x53380d13 + .word 0x8baf63de + .word 0x650a7354 + .word 0x3c77b2a8 + .word 0x766a0abb + .word 0x47edaee6 + .word 0x81c2c92e + .word 0x1482353b + .word 0x92722c85 + .word 0x4cf10364 + .word 0xa2bfe8a1 + .word 0xbc423001 + .word 0xa81a664b + .word 0xd0f89791 + .word 0xc24b8b70 + .word 0x654be30 + .word 0xc76c51a3 + .word 0xd6ef5218 + .word 0xd192e819 + .word 0x5565a910 + .word 0xd6990624 + .word 0x5771202a + .word 0xf40e3585 + .word 0x32bbd1b8 + .word 0x106aa070 + .word 0xb8d2d0c8 + .word 0x19a4c116 + .word 0x5141ab53 + .word 0x1e376c08 + .word 0xdf8eeb99 + .word 0x2748774c + .word 0xe19b48a8 + .word 0x34b0bcb5 + .word 0xc5c95a63 + .word 0x391c0cb3 + .word 0xe3418acb + .word 0x4ed8aa4a + .word 0x7763e373 + .word 0x5b9cca4f + .word 0xd6b2b8a3 + .word 0x682e6ff3 + .word 0x5defb2fc + .word 0x748f82ee + .word 0x43172f60 + .word 0x78a5636f + .word 0xa1f0ab72 + .word 0x84c87814 + .word 0x1a6439ec + .word 0x8cc70208 + .word 0x23631e28 + .word 0x90befffa + .word 0xde82bde9 + .word 0xa4506ceb + .word 0xb2c67915 + .word 0xbef9a3f7 + .word 0xe372532b + .word 0xc67178f2 + .word 0xea26619c + .word 0xca273ece + .word 0x21c0c207 + .word 0xd186b8c7 + .word 0xcde0eb1e + .word 0xeada7dd6 + .word 0xee6ed178 + .word 0xf57d4f7f + .word 0x72176fba + .word 0x6f067aa + .word 0xa2c898a6 + .word 0xa637dc5 + .word 0xbef90dae + .word 0x113f9804 + .word 0x131c471b + .word 0x1b710b35 + .word 0x23047d84 + .word 0x28db77f5 + .word 0x40c72493 + .word 
0x32caab7b + .word 0x15c9bebc + .word 0x3c9ebe0a + .word 0x9c100d4c + .word 0x431d67c4 + .word 0xcb3e42b6 + .word 0x4cc5d4be + .word 0xfc657e2a + .word 0x597f299c + .word 0x3ad6faec + .word 0x5fcb6fab + .word 0x4a475817 + .word 0x6c44198c + .text + .align 4 + .globl Transform_Sha512_Len + .type Transform_Sha512_Len, %function +Transform_Sha512_Len: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0xc0 + ADR r3, L_SHA512_transform_len_k + # Copy digest to add in at end + LDRD r4, r5, [r0] + LDRD r6, r7, [r0, #8] + LDRD r8, r9, [r0, #16] + LDRD r10, r11, [r0, #24] + STRD r4, r5, [sp, #128] + STRD r6, r7, [sp, #136] + STRD r8, r9, [sp, #144] + STRD r10, r11, [sp, #152] + LDRD r4, r5, [r0, #32] + LDRD r6, r7, [r0, #40] + LDRD r8, r9, [r0, #48] + LDRD r10, r11, [r0, #56] + STRD r4, r5, [sp, #160] + STRD r6, r7, [sp, #168] + STRD r8, r9, [sp, #176] + STRD r10, r11, [sp, #184] + # Start of loop processing a block +L_SHA512_transform_len_begin: + # Load, Reverse and Store W + LDR r4, [r1] + LDR r5, [r1, #4] + LDR r6, [r1, #8] + LDR r7, [r1, #12] + LDR r8, [r1, #16] + LDR r9, [r1, #20] + LDR r10, [r1, #24] + LDR r11, [r1, #28] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + REV r8, r8 + REV r9, r9 + REV r10, r10 + REV r11, r11 + STR r5, [sp] + STR r4, [sp, #4] + STR r7, [sp, #8] + STR r6, [sp, #12] + STR r9, [sp, #16] + STR r8, [sp, #20] + STR r11, [sp, #24] + STR r10, [sp, #28] + LDR r4, [r1, #32] + LDR r5, [r1, #36] + LDR r6, [r1, #40] + LDR r7, [r1, #44] + LDR r8, [r1, #48] + LDR r9, [r1, #52] + LDR r10, [r1, #56] + LDR r11, [r1, #60] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + REV r8, r8 + REV r9, r9 + REV r10, r10 + REV r11, r11 + STR r5, [sp, #32] + STR r4, [sp, #36] + STR r7, [sp, #40] + STR r6, [sp, #44] + STR r9, [sp, #48] + STR r8, [sp, #52] + STR r11, [sp, #56] + STR r10, [sp, #60] + LDR r4, [r1, #64] + LDR r5, [r1, #68] + LDR r6, [r1, #72] + LDR r7, [r1, #76] + LDR r8, [r1, #80] + LDR r9, [r1, #84] + LDR r10, [r1, #88] + LDR r11, [r1, #92] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + REV r8, r8 + REV r9, r9 + REV r10, r10 + REV r11, r11 + STR r5, [sp, #64] + STR r4, [sp, #68] + STR r7, [sp, #72] + STR r6, [sp, #76] + STR r9, [sp, #80] + STR r8, [sp, #84] + STR r11, [sp, #88] + STR r10, [sp, #92] + LDR r4, [r1, #96] + LDR r5, [r1, #100] + LDR r6, [r1, #104] + LDR r7, [r1, #108] + LDR r8, [r1, #112] + LDR r9, [r1, #116] + LDR r10, [r1, #120] + LDR r11, [r1, #124] + REV r4, r4 + REV r5, r5 + REV r6, r6 + REV r7, r7 + REV r8, r8 + REV r9, r9 + REV r10, r10 + REV r11, r11 + STR r5, [sp, #96] + STR r4, [sp, #100] + STR r7, [sp, #104] + STR r6, [sp, #108] + STR r9, [sp, #112] + STR r8, [sp, #116] + STR r11, [sp, #120] + STR r10, [sp, #124] + # Pre-calc: b ^ c + LDRD r10, r11, [r0, #8] + LDRD r4, r5, [r0, #16] + EOR r10, r10, r4 + EOR r11, r11, r5 + MOV r12, #0x4 + # Start of 16 rounds +L_SHA512_transform_len_start: + # Round 0 + LDRD r4, r5, [r0, #32] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #56] + LDRD r4, r5, [r0, #32] + LDRD r6, r7, [r0, #40] + LDRD r8, r9, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, 
#56] + LDRD r8, r9, [sp] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #24] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #56] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0] + STRD r8, r9, [r0, #24] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0] + LDRD r6, r7, [r0, #8] + STRD r4, r5, [r0, #56] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #56] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #56] + MOV r10, r8 + MOV r11, r9 + # Calc new W[0] + LDRD r4, r5, [sp, #112] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp] + LDRD r8, r9, [sp, #72] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp] + LDRD r4, r5, [sp, #8] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp] + # Round 1 + LDRD r4, r5, [r0, #24] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #48] + LDRD r4, r5, [r0, #24] + LDRD r6, r7, [r0, #32] + LDRD r8, r9, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #48] + LDRD r8, r9, [sp, #8] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #8] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #16] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #48] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #56] + STRD r8, r9, [r0, #16] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #56] + LDRD r6, r7, [r0] + STRD r4, r5, [r0, #48] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #48] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #48] + MOV r10, r8 + MOV r11, r9 + # Calc new W[1] + LDRD r4, r5, [sp, #120] + LSRS r6, r4, 
#19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #8] + LDRD r8, r9, [sp, #80] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #8] + LDRD r4, r5, [sp, #16] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #8] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #8] + # Round 2 + LDRD r4, r5, [r0, #16] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #40] + LDRD r4, r5, [r0, #16] + LDRD r6, r7, [r0, #24] + LDRD r8, r9, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #40] + LDRD r8, r9, [sp, #16] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #16] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #8] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #40] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #48] + STRD r8, r9, [r0, #8] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #48] + LDRD r6, r7, [r0, #56] + STRD r4, r5, [r0, #40] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #40] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #40] + MOV r10, r8 + MOV r11, r9 + # Calc new W[2] + LDRD r4, r5, [sp] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #16] + LDRD r8, r9, [sp, #88] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #16] + LDRD r4, r5, [sp, #24] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #16] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #16] + # Round 3 + LDRD r4, r5, [r0, #8] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, 
r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #32] + LDRD r4, r5, [r0, #8] + LDRD r6, r7, [r0, #16] + LDRD r8, r9, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #32] + LDRD r8, r9, [sp, #24] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #24] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #32] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #40] + STRD r8, r9, [r0] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #40] + LDRD r6, r7, [r0, #48] + STRD r4, r5, [r0, #32] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #32] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #32] + MOV r10, r8 + MOV r11, r9 + # Calc new W[3] + LDRD r4, r5, [sp, #8] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #24] + LDRD r8, r9, [sp, #96] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #24] + LDRD r4, r5, [sp, #32] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #24] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #24] + # Round 4 + LDRD r4, r5, [r0] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #24] + LDRD r4, r5, [r0] + LDRD r6, r7, [r0, #8] + LDRD r8, r9, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #24] + LDRD r8, r9, [sp, #32] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #32] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #56] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #24] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #32] + STRD r8, r9, [r0, #56] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + 
LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #32] + LDRD r6, r7, [r0, #40] + STRD r4, r5, [r0, #24] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #24] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #24] + MOV r10, r8 + MOV r11, r9 + # Calc new W[4] + LDRD r4, r5, [sp, #16] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #32] + LDRD r8, r9, [sp, #104] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #32] + LDRD r4, r5, [sp, #40] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #32] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #32] + # Round 5 + LDRD r4, r5, [r0, #56] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #16] + LDRD r4, r5, [r0, #56] + LDRD r6, r7, [r0] + LDRD r8, r9, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #16] + LDRD r8, r9, [sp, #40] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #40] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #48] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #16] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #24] + STRD r8, r9, [r0, #48] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #24] + LDRD r6, r7, [r0, #32] + STRD r4, r5, [r0, #16] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #16] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #16] + MOV r10, r8 + MOV r11, r9 + # Calc new W[5] + LDRD r4, r5, [sp, #24] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #40] + LDRD r8, r9, [sp, #112] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #40] + LDRD r4, r5, [sp, #48] + LSRS r6, r4, #1 + 
LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #40] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #40] + # Round 6 + LDRD r4, r5, [r0, #48] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #8] + LDRD r4, r5, [r0, #48] + LDRD r6, r7, [r0, #56] + LDRD r8, r9, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #8] + LDRD r8, r9, [sp, #48] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #48] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #40] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #8] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #16] + STRD r8, r9, [r0, #40] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #16] + LDRD r6, r7, [r0, #24] + STRD r4, r5, [r0, #8] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #8] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #8] + MOV r10, r8 + MOV r11, r9 + # Calc new W[6] + LDRD r4, r5, [sp, #32] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #48] + LDRD r8, r9, [sp, #120] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #48] + LDRD r4, r5, [sp, #56] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #48] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #48] + # Round 7 + LDRD r4, r5, [r0, #40] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0] + LDRD r4, r5, [r0, #40] + LDRD r6, r7, [r0, #48] + LDRD r8, r9, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, 
[r0] + LDRD r8, r9, [sp, #56] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #56] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #32] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #8] + STRD r8, r9, [r0, #32] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #8] + LDRD r6, r7, [r0, #16] + STRD r4, r5, [r0] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0] + MOV r10, r8 + MOV r11, r9 + # Calc new W[7] + LDRD r4, r5, [sp, #40] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #56] + LDRD r8, r9, [sp] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #56] + LDRD r4, r5, [sp, #64] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #56] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #56] + # Round 8 + LDRD r4, r5, [r0, #32] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #56] + LDRD r4, r5, [r0, #32] + LDRD r6, r7, [r0, #40] + LDRD r8, r9, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #56] + LDRD r8, r9, [sp, #64] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #64] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #24] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #56] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0] + STRD r8, r9, [r0, #24] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0] + LDRD r6, r7, [r0, #8] + STRD r4, r5, [r0, #56] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #56] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #56] + MOV r10, r8 + MOV r11, r9 + # Calc new W[8] + LDRD r4, r5, [sp, #48] + LSRS r6, 
r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #64] + LDRD r8, r9, [sp, #8] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #64] + LDRD r4, r5, [sp, #72] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #64] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #64] + # Round 9 + LDRD r4, r5, [r0, #24] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #48] + LDRD r4, r5, [r0, #24] + LDRD r6, r7, [r0, #32] + LDRD r8, r9, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #48] + LDRD r8, r9, [sp, #72] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #72] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #16] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #48] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #56] + STRD r8, r9, [r0, #16] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #56] + LDRD r6, r7, [r0] + STRD r4, r5, [r0, #48] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #48] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #48] + MOV r10, r8 + MOV r11, r9 + # Calc new W[9] + LDRD r4, r5, [sp, #56] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #72] + LDRD r8, r9, [sp, #16] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #72] + LDRD r4, r5, [sp, #80] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #72] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #72] + # Round 10 + LDRD r4, r5, [r0, #16] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + 
LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #40] + LDRD r4, r5, [r0, #16] + LDRD r6, r7, [r0, #24] + LDRD r8, r9, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #40] + LDRD r8, r9, [sp, #80] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #80] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #8] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #40] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #48] + STRD r8, r9, [r0, #8] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #48] + LDRD r6, r7, [r0, #56] + STRD r4, r5, [r0, #40] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #40] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #40] + MOV r10, r8 + MOV r11, r9 + # Calc new W[10] + LDRD r4, r5, [sp, #64] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #80] + LDRD r8, r9, [sp, #24] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #80] + LDRD r4, r5, [sp, #88] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #80] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #80] + # Round 11 + LDRD r4, r5, [r0, #8] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #32] + LDRD r4, r5, [r0, #8] + LDRD r6, r7, [r0, #16] + LDRD r8, r9, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #32] + LDRD r8, r9, [sp, #88] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #88] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #32] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #40] + STRD r8, r9, [r0] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 
+ LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #40] + LDRD r6, r7, [r0, #48] + STRD r4, r5, [r0, #32] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #32] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #32] + MOV r10, r8 + MOV r11, r9 + # Calc new W[11] + LDRD r4, r5, [sp, #72] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #88] + LDRD r8, r9, [sp, #32] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #88] + LDRD r4, r5, [sp, #96] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #88] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #88] + # Round 12 + LDRD r4, r5, [r0] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #24] + LDRD r4, r5, [r0] + LDRD r6, r7, [r0, #8] + LDRD r8, r9, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #24] + LDRD r8, r9, [sp, #96] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #96] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #56] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #24] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #32] + STRD r8, r9, [r0, #56] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #32] + LDRD r6, r7, [r0, #40] + STRD r4, r5, [r0, #24] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #24] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #24] + MOV r10, r8 + MOV r11, r9 + # Calc new W[12] + LDRD r4, r5, [sp, #80] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #96] + LDRD r8, r9, [sp, #40] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #96] + LDRD r4, r5, [sp, #104] + 
LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #96] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #96] + # Round 13 + LDRD r4, r5, [r0, #56] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #16] + LDRD r4, r5, [r0, #56] + LDRD r6, r7, [r0] + LDRD r8, r9, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #16] + LDRD r8, r9, [sp, #104] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #104] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #48] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #16] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #24] + STRD r8, r9, [r0, #48] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #24] + LDRD r6, r7, [r0, #32] + STRD r4, r5, [r0, #16] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #16] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #16] + MOV r10, r8 + MOV r11, r9 + # Calc new W[13] + LDRD r4, r5, [sp, #88] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #104] + LDRD r8, r9, [sp, #48] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #104] + LDRD r4, r5, [sp, #112] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #104] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #104] + # Round 14 + LDRD r4, r5, [r0, #48] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #8] + LDRD r4, r5, [r0, #48] + LDRD r6, r7, [r0, #56] + LDRD r8, r9, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, 
r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #8] + LDRD r8, r9, [sp, #112] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #112] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #40] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #8] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #16] + STRD r8, r9, [r0, #40] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #16] + LDRD r6, r7, [r0, #24] + STRD r4, r5, [r0, #8] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #8] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #8] + MOV r10, r8 + MOV r11, r9 + # Calc new W[14] + LDRD r4, r5, [sp, #96] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #112] + LDRD r8, r9, [sp, #56] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #112] + LDRD r4, r5, [sp, #120] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #112] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #112] + # Round 15 + LDRD r4, r5, [r0, #40] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0] + LDRD r4, r5, [r0, #40] + LDRD r6, r7, [r0, #48] + LDRD r8, r9, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0] + LDRD r8, r9, [sp, #120] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #120] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #32] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #8] + STRD r8, r9, [r0, #32] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #8] + LDRD r6, r7, [r0, #16] + STRD r4, r5, [r0] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0] + MOV r10, r8 + MOV r11, r9 + # Calc 
new W[15] + LDRD r4, r5, [sp, #104] + LSRS r6, r4, #19 + LSRS r7, r5, #19 + ORR r7, r7, r4, LSL #13 + ORR r6, r6, r5, LSL #13 + LSLS r8, r4, #3 + LSLS r9, r5, #3 + ORR r9, r9, r4, LSR #29 + ORR r8, r8, r5, LSR #29 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #6 + LSRS r9, r5, #6 + ORR r8, r8, r5, LSL #26 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #120] + LDRD r8, r9, [sp, #64] + ADDS r4, r4, r6 + ADC r5, r5, r7 + ADDS r4, r4, r8 + ADC r5, r5, r9 + STRD r4, r5, [sp, #120] + LDRD r4, r5, [sp] + LSRS r6, r4, #1 + LSRS r7, r5, #1 + ORR r7, r7, r4, LSL #31 + ORR r6, r6, r5, LSL #31 + LSRS r8, r4, #8 + LSRS r9, r5, #8 + ORR r9, r9, r4, LSL #24 + ORR r8, r8, r5, LSL #24 + EOR r7, r7, r9 + EOR r6, r6, r8 + LSRS r8, r4, #7 + LSRS r9, r5, #7 + ORR r8, r8, r5, LSL #25 + EOR r7, r7, r9 + EOR r6, r6, r8 + LDRD r4, r5, [sp, #120] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [sp, #120] + ADD r3, r3, #0x80 + SUBS r12, r12, #0x1 + BNE L_SHA512_transform_len_start + # Round 0 + LDRD r4, r5, [r0, #32] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #56] + LDRD r4, r5, [r0, #32] + LDRD r6, r7, [r0, #40] + LDRD r8, r9, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #56] + LDRD r8, r9, [sp] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #24] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #56] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0] + STRD r8, r9, [r0, #24] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0] + LDRD r6, r7, [r0, #8] + STRD r4, r5, [r0, #56] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #56] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #56] + MOV r10, r8 + MOV r11, r9 + # Round 1 + LDRD r4, r5, [r0, #24] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #48] + LDRD r4, r5, [r0, #24] + LDRD r6, r7, [r0, #32] + LDRD r8, r9, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #48] + LDRD r8, r9, [sp, #8] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #8] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #16] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #48] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #56] + STRD r8, r9, [r0, #16] + 
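[Editorial note] The sixteen `# Round 0` .. `# Round 15` blocks that follow the `BNE L_SHA512_transform_len_start` above are the last 16 of the 80 SHA-512 rounds (the loop before them runs four passes of 16 rounds), which is why they omit the `# Calc new W[...]` step. For readers not fluent in the generated Thumb-2, here is a minimal C sketch of what one such round computes; `ROR64`, `Sigma0`, `Sigma1`, `sha512_round` and `s[]` are illustrative names following FIPS 180-4, not identifiers from this patch.

    #include <stdint.h>

    #define ROR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))

    static uint64_t Sigma0(uint64_t a) { return ROR64(a, 28) ^ ROR64(a, 34) ^ ROR64(a, 39); }
    static uint64_t Sigma1(uint64_t e) { return ROR64(e, 14) ^ ROR64(e, 18) ^ ROR64(e, 41); }

    /* One SHA-512 round on the state s[0..7] = a..h with schedule word w
     * and round constant k.  The assembly keeps the state at [r0] and,
     * instead of moving eight variables, rotates which offset plays the
     * role of 'a' each round - hence the decreasing [r0, #..] offsets. */
    static void sha512_round(uint64_t s[8], uint64_t w, uint64_t k)
    {
        uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
        uint64_t e = s[4], f = s[5], g = s[6], h = s[7];
        uint64_t t1 = h + Sigma1(e) + ((e & f) ^ (~e & g)) + k + w; /* + Ch(e,f,g)  */
        uint64_t t2 = Sigma0(a) + ((a & b) ^ (a & c) ^ (b & c));    /*   Maj(a,b,c) */

        s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
        s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
    }

The LSRS #14 / LSRS #18 / LSLS #23 groups in each round are Sigma1's rotates by 14, 18 and 41 (a rotate right by 41 is a rotate left by 23), and the #28 / #30 / #25 groups are Sigma0's rotates by 28, 34 and 39.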
LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #56] + LDRD r6, r7, [r0] + STRD r4, r5, [r0, #48] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #48] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #48] + MOV r10, r8 + MOV r11, r9 + # Round 2 + LDRD r4, r5, [r0, #16] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #40] + LDRD r4, r5, [r0, #16] + LDRD r6, r7, [r0, #24] + LDRD r8, r9, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #40] + LDRD r8, r9, [sp, #16] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #16] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #8] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #40] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #48] + STRD r8, r9, [r0, #8] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #48] + LDRD r6, r7, [r0, #56] + STRD r4, r5, [r0, #40] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #40] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #40] + MOV r10, r8 + MOV r11, r9 + # Round 3 + LDRD r4, r5, [r0, #8] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #32] + LDRD r4, r5, [r0, #8] + LDRD r6, r7, [r0, #16] + LDRD r8, r9, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #32] + LDRD r8, r9, [sp, #24] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #24] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #32] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #40] + STRD r8, r9, [r0] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, 
LSR #7 + LDRD r4, r5, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #40] + LDRD r6, r7, [r0, #48] + STRD r4, r5, [r0, #32] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #32] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #32] + MOV r10, r8 + MOV r11, r9 + # Round 4 + LDRD r4, r5, [r0] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #24] + LDRD r4, r5, [r0] + LDRD r6, r7, [r0, #8] + LDRD r8, r9, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #24] + LDRD r8, r9, [sp, #32] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #32] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #56] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #24] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #32] + STRD r8, r9, [r0, #56] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #32] + LDRD r6, r7, [r0, #40] + STRD r4, r5, [r0, #24] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #24] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #24] + MOV r10, r8 + MOV r11, r9 + # Round 5 + LDRD r4, r5, [r0, #56] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #16] + LDRD r4, r5, [r0, #56] + LDRD r6, r7, [r0] + LDRD r8, r9, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #16] + LDRD r8, r9, [sp, #40] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #40] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #48] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #16] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #24] + STRD r8, r9, [r0, #48] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #24] + LDRD r6, r7, [r0, #32] + STRD r4, r5, [r0, #16] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + 
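[Editorial note] Thumb-2 has no 64-bit registers, so every rotate above is carried out on a low/high pair of 32-bit registers; judging from the word-swapped stores earlier in the function, the even register (e.g. r4) holds the low word and the odd one the high word. A sketch of the pattern, with hypothetical names `ror64_pair`, `lo`, `hi`:

    #include <stdint.h>

    /* 64-bit rotate right by n (0 < n < 32) on a lo/hi pair, mirroring the
     * LSRS/LSRS/ORR/ORR quadruplets in the generated code. */
    static void ror64_pair(uint32_t lo, uint32_t hi, unsigned n,
                           uint32_t *out_lo, uint32_t *out_hi)
    {
        *out_lo = (lo >> n) | (hi << (32 - n)); /* LSRS rlo,#n ; ORR rlo,rlo,rhi,LSL #(32-n) */
        *out_hi = (hi >> n) | (lo << (32 - n)); /* LSRS rhi,#n ; ORR rhi,rhi,rlo,LSL #(32-n) */
    }

Rotates by more than 32 are emitted as the equivalent left rotate (for example ROTR #41 appears as LSLS #23 with ORR ..., LSR #9), and the plain shifts in the schedule's sigma functions (SHR 6 and SHR 7) need only one ORR because the bits shifted out of the high word are discarded rather than wrapped around.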
LDRD r6, r7, [r0, #16] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #16] + MOV r10, r8 + MOV r11, r9 + # Round 6 + LDRD r4, r5, [r0, #48] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #8] + LDRD r4, r5, [r0, #48] + LDRD r6, r7, [r0, #56] + LDRD r8, r9, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #8] + LDRD r8, r9, [sp, #48] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #48] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #40] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #8] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #16] + STRD r8, r9, [r0, #40] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #16] + LDRD r6, r7, [r0, #24] + STRD r4, r5, [r0, #8] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #8] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #8] + MOV r10, r8 + MOV r11, r9 + # Round 7 + LDRD r4, r5, [r0, #40] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0] + LDRD r4, r5, [r0, #40] + LDRD r6, r7, [r0, #48] + LDRD r8, r9, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0] + LDRD r8, r9, [sp, #56] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #56] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #32] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #8] + STRD r8, r9, [r0, #32] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #8] + LDRD r6, r7, [r0, #16] + STRD r4, r5, [r0] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0] + MOV r10, r8 + MOV r11, r9 + # Round 8 + LDRD r4, r5, [r0, #32] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL 
#14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #56] + LDRD r4, r5, [r0, #32] + LDRD r6, r7, [r0, #40] + LDRD r8, r9, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #56] + LDRD r8, r9, [sp, #64] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #64] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #24] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #56] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0] + STRD r8, r9, [r0, #24] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0] + LDRD r6, r7, [r0, #8] + STRD r4, r5, [r0, #56] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #56] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #56] + MOV r10, r8 + MOV r11, r9 + # Round 9 + LDRD r4, r5, [r0, #24] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #48] + LDRD r4, r5, [r0, #24] + LDRD r6, r7, [r0, #32] + LDRD r8, r9, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #48] + LDRD r8, r9, [sp, #72] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #72] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #16] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #48] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #56] + STRD r8, r9, [r0, #16] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #48] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #56] + LDRD r6, r7, [r0] + STRD r4, r5, [r0, #48] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #48] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #48] + MOV r10, r8 + MOV r11, r9 + # Round 10 + LDRD r4, r5, [r0, #16] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #40] + LDRD r4, r5, [r0, #16] + LDRD r6, r7, 
[r0, #24] + LDRD r8, r9, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #40] + LDRD r8, r9, [sp, #80] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #80] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #8] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #40] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #48] + STRD r8, r9, [r0, #8] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #40] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #48] + LDRD r6, r7, [r0, #56] + STRD r4, r5, [r0, #40] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #40] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #40] + MOV r10, r8 + MOV r11, r9 + # Round 11 + LDRD r4, r5, [r0, #8] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #32] + LDRD r4, r5, [r0, #8] + LDRD r6, r7, [r0, #16] + LDRD r8, r9, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #32] + LDRD r8, r9, [sp, #88] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #88] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #32] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #40] + STRD r8, r9, [r0] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #32] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #40] + LDRD r6, r7, [r0, #48] + STRD r4, r5, [r0, #32] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #32] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #32] + MOV r10, r8 + MOV r11, r9 + # Round 12 + LDRD r4, r5, [r0] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #24] + LDRD r4, r5, [r0] + LDRD r6, r7, [r0, #8] + LDRD r8, r9, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #24] + LDRD r8, r9, [sp, #96] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #96] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, 
r9, [r0, #56] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #24] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #32] + STRD r8, r9, [r0, #56] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #24] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #32] + LDRD r6, r7, [r0, #40] + STRD r4, r5, [r0, #24] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #24] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #24] + MOV r10, r8 + MOV r11, r9 + # Round 13 + LDRD r4, r5, [r0, #56] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #16] + LDRD r4, r5, [r0, #56] + LDRD r6, r7, [r0] + LDRD r8, r9, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #16] + LDRD r8, r9, [sp, #104] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #104] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #48] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #16] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #24] + STRD r8, r9, [r0, #48] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #16] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #24] + LDRD r6, r7, [r0, #32] + STRD r4, r5, [r0, #16] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #16] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #16] + MOV r10, r8 + MOV r11, r9 + # Round 14 + LDRD r4, r5, [r0, #48] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #8] + LDRD r4, r5, [r0, #48] + LDRD r6, r7, [r0, #56] + LDRD r8, r9, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0, #8] + LDRD r8, r9, [sp, #112] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #112] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #40] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0, #8] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #16] + STRD r8, r9, [r0, #40] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, 
r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0, #8] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #16] + LDRD r6, r7, [r0, #24] + STRD r4, r5, [r0, #8] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0, #8] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0, #8] + MOV r10, r8 + MOV r11, r9 + # Round 15 + LDRD r4, r5, [r0, #40] + LSRS r6, r4, #14 + LSRS r7, r5, #14 + ORR r7, r7, r4, LSL #18 + ORR r6, r6, r5, LSL #18 + LSRS r8, r4, #18 + LSRS r9, r5, #18 + ORR r9, r9, r4, LSL #14 + ORR r8, r8, r5, LSL #14 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #23 + LSLS r9, r5, #23 + ORR r9, r9, r4, LSR #9 + ORR r8, r8, r5, LSR #9 + LDRD r4, r5, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0] + LDRD r4, r5, [r0, #40] + LDRD r6, r7, [r0, #48] + LDRD r8, r9, [r0, #56] + EOR r6, r6, r8 + EOR r7, r7, r9 + AND r6, r6, r4 + AND r7, r7, r5 + EOR r6, r6, r8 + EOR r7, r7, r9 + LDRD r4, r5, [r0] + LDRD r8, r9, [sp, #120] + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r6, r7, [r3, #120] + ADDS r4, r4, r8 + ADC r5, r5, r9 + LDRD r8, r9, [r0, #32] + ADDS r4, r4, r6 + ADC r5, r5, r7 + STRD r4, r5, [r0] + ADDS r8, r8, r4 + ADC r9, r9, r5 + LDRD r4, r5, [r0, #8] + STRD r8, r9, [r0, #32] + LSRS r6, r4, #28 + LSRS r7, r5, #28 + ORR r7, r7, r4, LSL #4 + ORR r6, r6, r5, LSL #4 + LSLS r8, r4, #30 + LSLS r9, r5, #30 + ORR r9, r9, r4, LSR #2 + ORR r8, r8, r5, LSR #2 + EOR r6, r6, r8 + EOR r7, r7, r9 + LSLS r8, r4, #25 + LSLS r9, r5, #25 + ORR r9, r9, r4, LSR #7 + ORR r8, r8, r5, LSR #7 + LDRD r4, r5, [r0] + EOR r6, r6, r8 + EOR r7, r7, r9 + ADDS r4, r4, r6 + ADC r5, r5, r7 + LDRD r8, r9, [r0, #8] + LDRD r6, r7, [r0, #16] + STRD r4, r5, [r0] + EOR r8, r8, r6 + EOR r9, r9, r7 + AND r10, r10, r8 + AND r11, r11, r9 + EOR r10, r10, r6 + EOR r11, r11, r7 + LDRD r6, r7, [r0] + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r6, r7, [r0] + MOV r10, r8 + MOV r11, r9 + # Add in digest from start + LDRD r4, r5, [r0] + LDRD r6, r7, [r0, #8] + LDRD r8, r9, [sp, #128] + LDRD r10, r11, [sp, #136] + ADDS r4, r4, r8 + ADC r5, r5, r9 + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r4, r5, [r0] + STRD r6, r7, [r0, #8] + STRD r4, r5, [sp, #128] + STRD r6, r7, [sp, #136] + LDRD r4, r5, [r0, #16] + LDRD r6, r7, [r0, #24] + LDRD r8, r9, [sp, #144] + LDRD r10, r11, [sp, #152] + ADDS r4, r4, r8 + ADC r5, r5, r9 + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r4, r5, [r0, #16] + STRD r6, r7, [r0, #24] + STRD r4, r5, [sp, #144] + STRD r6, r7, [sp, #152] + LDRD r4, r5, [r0, #32] + LDRD r6, r7, [r0, #40] + LDRD r8, r9, [sp, #160] + LDRD r10, r11, [sp, #168] + ADDS r4, r4, r8 + ADC r5, r5, r9 + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r4, r5, [r0, #32] + STRD r6, r7, [r0, #40] + STRD r4, r5, [sp, #160] + STRD r6, r7, [sp, #168] + LDRD r4, r5, [r0, #48] + LDRD r6, r7, [r0, #56] + LDRD r8, r9, [sp, #176] + LDRD r10, r11, [sp, #184] + ADDS r4, r4, r8 + ADC r5, r5, r9 + ADDS r6, r6, r10 + ADC r7, r7, r11 + STRD r4, r5, [r0, #48] + STRD r6, r7, [r0, #56] + STRD r4, r5, [sp, #176] + STRD r6, r7, [sp, #184] + SUBS r2, r2, #0x80 + SUB r3, r3, #0x200 + ADD r1, r1, #0x80 + BNE L_SHA512_transform_len_begin + EOR r0, r0, r0 + ADD sp, sp, #0xc0 + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + # Cycle Count = 5021 + .size 
Transform_Sha512_Len,.-Transform_Sha512_Len +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#endif /* WOLFSSL_SHA512 */ +#endif /* !__aarch64__ && __thumb__ */ +#endif /* WOLFSSL_ARMASM */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif +#endif /* !WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c b/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c new file mode 100644 index 0000000000..9ec7e190d1 --- /dev/null +++ b/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c @@ -0,0 +1,3591 @@ +/* thumb2-sha512-asm + * + * Copyright (C) 2006-2023 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./sha2/sha512.rb thumb2 ../wolfssl/wolfcrypt/src/port/arm/thumb2-sha512-asm.c + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__thumb__) +#include +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include +#ifdef WOLFSSL_ARMASM_INLINE +#ifdef WOLFSSL_SHA512 +#include + +#ifdef WOLFSSL_ARMASM_NO_NEON +static const uint64_t L_SHA512_transform_len_k[] = { + 0x428a2f98d728ae22UL, + 0x7137449123ef65cdUL, + 0xb5c0fbcfec4d3b2fUL, + 0xe9b5dba58189dbbcUL, + 0x3956c25bf348b538UL, + 0x59f111f1b605d019UL, + 0x923f82a4af194f9bUL, + 0xab1c5ed5da6d8118UL, + 0xd807aa98a3030242UL, + 0x12835b0145706fbeUL, + 0x243185be4ee4b28cUL, + 0x550c7dc3d5ffb4e2UL, + 0x72be5d74f27b896fUL, + 0x80deb1fe3b1696b1UL, + 0x9bdc06a725c71235UL, + 0xc19bf174cf692694UL, + 0xe49b69c19ef14ad2UL, + 0xefbe4786384f25e3UL, + 0xfc19dc68b8cd5b5UL, + 0x240ca1cc77ac9c65UL, + 0x2de92c6f592b0275UL, + 0x4a7484aa6ea6e483UL, + 0x5cb0a9dcbd41fbd4UL, + 0x76f988da831153b5UL, + 0x983e5152ee66dfabUL, + 0xa831c66d2db43210UL, + 0xb00327c898fb213fUL, + 0xbf597fc7beef0ee4UL, + 0xc6e00bf33da88fc2UL, + 0xd5a79147930aa725UL, + 0x6ca6351e003826fUL, + 0x142929670a0e6e70UL, + 0x27b70a8546d22ffcUL, + 0x2e1b21385c26c926UL, + 0x4d2c6dfc5ac42aedUL, + 0x53380d139d95b3dfUL, + 0x650a73548baf63deUL, + 0x766a0abb3c77b2a8UL, + 0x81c2c92e47edaee6UL, + 0x92722c851482353bUL, + 0xa2bfe8a14cf10364UL, + 0xa81a664bbc423001UL, + 0xc24b8b70d0f89791UL, + 0xc76c51a30654be30UL, + 0xd192e819d6ef5218UL, + 0xd69906245565a910UL, + 0xf40e35855771202aUL, + 0x106aa07032bbd1b8UL, + 0x19a4c116b8d2d0c8UL, + 0x1e376c085141ab53UL, + 0x2748774cdf8eeb99UL, + 0x34b0bcb5e19b48a8UL, + 0x391c0cb3c5c95a63UL, + 0x4ed8aa4ae3418acbUL, + 0x5b9cca4f7763e373UL, + 0x682e6ff3d6b2b8a3UL, + 0x748f82ee5defb2fcUL, + 0x78a5636f43172f60UL, + 0x84c87814a1f0ab72UL, + 0x8cc702081a6439ecUL, + 0x90befffa23631e28UL, + 0xa4506cebde82bde9UL, + 0xbef9a3f7b2c67915UL, + 0xc67178f2e372532bUL, + 0xca273eceea26619cUL, + 0xd186b8c721c0c207UL, + 0xeada7dd6cde0eb1eUL, + 0xf57d4f7fee6ed178UL, + 
0x6f067aa72176fbaUL, + 0xa637dc5a2c898a6UL, + 0x113f9804bef90daeUL, + 0x1b710b35131c471bUL, + 0x28db77f523047d84UL, + 0x32caab7b40c72493UL, + 0x3c9ebe0a15c9bebcUL, + 0x431d67c49c100d4cUL, + 0x4cc5d4becb3e42b6UL, + 0x597f299cfc657e2aUL, + 0x5fcb6fab3ad6faecUL, + 0x6c44198c4a475817UL, +}; + +void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len); +void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) +{ + __asm__ __volatile__ ( + "SUB sp, sp, #0xc0\n\t" + "MOV r3, %[L_SHA512_transform_len_k]\n\t" + /* Copy digest to add in at end */ + "LDRD r4, r5, [%[sha512]]\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "LDRD r10, r11, [%[sha512], #24]\n\t" + "STRD r4, r5, [sp, #128]\n\t" + "STRD r6, r7, [sp, #136]\n\t" + "STRD r8, r9, [sp, #144]\n\t" + "STRD r10, r11, [sp, #152]\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "LDRD r10, r11, [%[sha512], #56]\n\t" + "STRD r4, r5, [sp, #160]\n\t" + "STRD r6, r7, [sp, #168]\n\t" + "STRD r8, r9, [sp, #176]\n\t" + "STRD r10, r11, [sp, #184]\n\t" + /* Start of loop processing a block */ + "\n" + "L_SHA512_transform_len_begin_%=:\n\t" + /* Load, Reverse and Store W */ + "LDR r4, [%[data]]\n\t" + "LDR r5, [%[data], #4]\n\t" + "LDR r6, [%[data], #8]\n\t" + "LDR r7, [%[data], #12]\n\t" + "LDR r8, [%[data], #16]\n\t" + "LDR r9, [%[data], #20]\n\t" + "LDR r10, [%[data], #24]\n\t" + "LDR r11, [%[data], #28]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "REV r8, r8\n\t" + "REV r9, r9\n\t" + "REV r10, r10\n\t" + "REV r11, r11\n\t" + "STR r5, [sp]\n\t" + "STR r4, [sp, #4]\n\t" + "STR r7, [sp, #8]\n\t" + "STR r6, [sp, #12]\n\t" + "STR r9, [sp, #16]\n\t" + "STR r8, [sp, #20]\n\t" + "STR r11, [sp, #24]\n\t" + "STR r10, [sp, #28]\n\t" + "LDR r4, [%[data], #32]\n\t" + "LDR r5, [%[data], #36]\n\t" + "LDR r6, [%[data], #40]\n\t" + "LDR r7, [%[data], #44]\n\t" + "LDR r8, [%[data], #48]\n\t" + "LDR r9, [%[data], #52]\n\t" + "LDR r10, [%[data], #56]\n\t" + "LDR r11, [%[data], #60]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "REV r8, r8\n\t" + "REV r9, r9\n\t" + "REV r10, r10\n\t" + "REV r11, r11\n\t" + "STR r5, [sp, #32]\n\t" + "STR r4, [sp, #36]\n\t" + "STR r7, [sp, #40]\n\t" + "STR r6, [sp, #44]\n\t" + "STR r9, [sp, #48]\n\t" + "STR r8, [sp, #52]\n\t" + "STR r11, [sp, #56]\n\t" + "STR r10, [sp, #60]\n\t" + "LDR r4, [%[data], #64]\n\t" + "LDR r5, [%[data], #68]\n\t" + "LDR r6, [%[data], #72]\n\t" + "LDR r7, [%[data], #76]\n\t" + "LDR r8, [%[data], #80]\n\t" + "LDR r9, [%[data], #84]\n\t" + "LDR r10, [%[data], #88]\n\t" + "LDR r11, [%[data], #92]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "REV r8, r8\n\t" + "REV r9, r9\n\t" + "REV r10, r10\n\t" + "REV r11, r11\n\t" + "STR r5, [sp, #64]\n\t" + "STR r4, [sp, #68]\n\t" + "STR r7, [sp, #72]\n\t" + "STR r6, [sp, #76]\n\t" + "STR r9, [sp, #80]\n\t" + "STR r8, [sp, #84]\n\t" + "STR r11, [sp, #88]\n\t" + "STR r10, [sp, #92]\n\t" + "LDR r4, [%[data], #96]\n\t" + "LDR r5, [%[data], #100]\n\t" + "LDR r6, [%[data], #104]\n\t" + "LDR r7, [%[data], #108]\n\t" + "LDR r8, [%[data], #112]\n\t" + "LDR r9, [%[data], #116]\n\t" + "LDR r10, [%[data], #120]\n\t" + "LDR r11, [%[data], #124]\n\t" + "REV r4, r4\n\t" + "REV r5, r5\n\t" + "REV r6, r6\n\t" + "REV r7, r7\n\t" + "REV r8, r8\n\t" + "REV r9, r9\n\t" + "REV r10, r10\n\t" + "REV r11, r11\n\t" + "STR r5, [sp, #96]\n\t" + "STR r4, [sp, 
#100]\n\t" + "STR r7, [sp, #104]\n\t" + "STR r6, [sp, #108]\n\t" + "STR r9, [sp, #112]\n\t" + "STR r8, [sp, #116]\n\t" + "STR r11, [sp, #120]\n\t" + "STR r10, [sp, #124]\n\t" + /* Pre-calc: b ^ c */ + "LDRD r10, r11, [%[sha512], #8]\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "EOR r10, r10, r4\n\t" + "EOR r11, r11, r5\n\t" + "MOV r12, #0x4\n\t" + /* Start of 16 rounds */ + "\n" + "L_SHA512_transform_len_start_%=:\n\t" + /* Round 0 */ + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r8, r9, [sp]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #56]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "STRD r8, r9, [%[sha512], #24]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "STRD r4, r5, [%[sha512], #56]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #56]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[0] */ + "LDRD r4, r5, [sp, #112]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp]\n\t" + "LDRD r8, r9, [sp, #72]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp]\n\t" + "LDRD r4, r5, [sp, #8]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR 
r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp]\n\t" + /* Round 1 */ + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r8, r9, [sp, #8]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #8]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "STRD r8, r9, [%[sha512], #16]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #48]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[1] */ + "LDRD r4, r5, [sp, #120]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #8]\n\t" + "LDRD r8, r9, [sp, #80]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #8]\n\t" + "LDRD r4, r5, [sp, #16]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, 
[sp, #8]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #8]\n\t" + /* Round 2 */ + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r8, r9, [sp, #16]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #16]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #40]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "STRD r8, r9, [%[sha512], #8]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "STRD r4, r5, [%[sha512], #40]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #40]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[2] */ + "LDRD r4, r5, [sp]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #16]\n\t" + "LDRD r8, r9, [sp, #88]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #16]\n\t" + "LDRD r4, r5, [sp, #24]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #16]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #16]\n\t" + /* Round 3 */ + "LDRD r4, r5, [%[sha512], #8]\n\t" + 
"LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r8, r9, [sp, #24]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #24]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "STRD r8, r9, [%[sha512]]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #32]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[3] */ + "LDRD r4, r5, [sp, #8]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #24]\n\t" + "LDRD r8, r9, [sp, #96]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #24]\n\t" + "LDRD r4, r5, [sp, #32]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #24]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #24]\n\t" + /* Round 4 */ + "LDRD r4, r5, [%[sha512]]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, 
#18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r8, r9, [sp, #32]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #32]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "STRD r8, r9, [%[sha512], #56]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #24]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[4] */ + "LDRD r4, r5, [sp, #16]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #32]\n\t" + "LDRD r8, r9, [sp, #104]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #32]\n\t" + "LDRD r4, r5, [sp, #40]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #32]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #32]\n\t" + /* Round 5 */ + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + 
"LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r8, r9, [sp, #40]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #40]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "STRD r8, r9, [%[sha512], #48]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #16]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[5] */ + "LDRD r4, r5, [sp, #24]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #40]\n\t" + "LDRD r8, r9, [sp, #112]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #40]\n\t" + "LDRD r4, r5, [sp, #48]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #40]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #40]\n\t" + /* Round 6 */ + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + 
"EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r8, r9, [sp, #48]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #48]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "STRD r8, r9, [%[sha512], #40]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #8]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[6] */ + "LDRD r4, r5, [sp, #32]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #48]\n\t" + "LDRD r8, r9, [sp, #120]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #48]\n\t" + "LDRD r4, r5, [sp, #56]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #48]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #48]\n\t" + /* Round 7 */ + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r6, r7, 
[%[sha512], #48]\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "LDRD r8, r9, [sp, #56]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #56]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "STRD r8, r9, [%[sha512], #32]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512]]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[7] */ + "LDRD r4, r5, [sp, #40]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #56]\n\t" + "LDRD r8, r9, [sp]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #56]\n\t" + "LDRD r4, r5, [sp, #64]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #56]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #56]\n\t" + /* Round 8 */ + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + 
"EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r8, r9, [sp, #64]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #64]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #56]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "STRD r8, r9, [%[sha512], #24]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "STRD r4, r5, [%[sha512], #56]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #56]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[8] */ + "LDRD r4, r5, [sp, #48]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #64]\n\t" + "LDRD r8, r9, [sp, #8]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #64]\n\t" + "LDRD r4, r5, [sp, #72]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #64]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #64]\n\t" + /* Round 9 */ + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r8, r9, [sp, #72]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, 
[r3, #72]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "STRD r8, r9, [%[sha512], #16]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #48]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[9] */ + "LDRD r4, r5, [sp, #56]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #72]\n\t" + "LDRD r8, r9, [sp, #16]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #72]\n\t" + "LDRD r4, r5, [sp, #80]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #72]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #72]\n\t" + /* Round 10 */ + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r8, r9, [sp, #80]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #80]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" 
+ "STRD r4, r5, [%[sha512], #40]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "STRD r8, r9, [%[sha512], #8]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "STRD r4, r5, [%[sha512], #40]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #40]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[10] */ + "LDRD r4, r5, [sp, #64]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #80]\n\t" + "LDRD r8, r9, [sp, #24]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #80]\n\t" + "LDRD r4, r5, [sp, #88]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #80]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #80]\n\t" + /* Round 11 */ + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r8, r9, [sp, #88]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #88]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "STRD r8, r9, 
[%[sha512]]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #32]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[11] */ + "LDRD r4, r5, [sp, #72]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #88]\n\t" + "LDRD r8, r9, [sp, #32]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #88]\n\t" + "LDRD r4, r5, [sp, #96]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #88]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #88]\n\t" + /* Round 12 */ + "LDRD r4, r5, [%[sha512]]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r8, r9, [sp, #96]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #96]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "STRD r8, r9, [%[sha512], #56]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, 
#30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #24]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[12] */ + "LDRD r4, r5, [sp, #80]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #96]\n\t" + "LDRD r8, r9, [sp, #40]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #96]\n\t" + "LDRD r4, r5, [sp, #104]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #96]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #96]\n\t" + /* Round 13 */ + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r8, r9, [sp, #104]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #104]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "STRD r8, r9, [%[sha512], #48]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" 
+ "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #16]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[13] */ + "LDRD r4, r5, [sp, #88]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #104]\n\t" + "LDRD r8, r9, [sp, #48]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #104]\n\t" + "LDRD r4, r5, [sp, #112]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #104]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #104]\n\t" + /* Round 14 */ + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r8, r9, [sp, #112]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #112]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "STRD r8, r9, [%[sha512], #40]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], 
#8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #8]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[14] */ + "LDRD r4, r5, [sp, #96]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #112]\n\t" + "LDRD r8, r9, [sp, #56]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #112]\n\t" + "LDRD r4, r5, [sp, #120]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #112]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #112]\n\t" + /* Round 15 */ + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "LDRD r8, r9, [sp, #120]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #120]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "STRD r8, r9, [%[sha512], #32]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "LDRD r6, r7, 
[%[sha512], #16]\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512]]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Calc new W[15] */ + "LDRD r4, r5, [sp, #104]\n\t" + "LSRS r6, r4, #19\n\t" + "LSRS r7, r5, #19\n\t" + "ORR r7, r7, r4, LSL #13\n\t" + "ORR r6, r6, r5, LSL #13\n\t" + "LSLS r8, r4, #3\n\t" + "LSLS r9, r5, #3\n\t" + "ORR r9, r9, r4, LSR #29\n\t" + "ORR r8, r8, r5, LSR #29\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #6\n\t" + "LSRS r9, r5, #6\n\t" + "ORR r8, r8, r5, LSL #26\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #120]\n\t" + "LDRD r8, r9, [sp, #64]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "STRD r4, r5, [sp, #120]\n\t" + "LDRD r4, r5, [sp]\n\t" + "LSRS r6, r4, #1\n\t" + "LSRS r7, r5, #1\n\t" + "ORR r7, r7, r4, LSL #31\n\t" + "ORR r6, r6, r5, LSL #31\n\t" + "LSRS r8, r4, #8\n\t" + "LSRS r9, r5, #8\n\t" + "ORR r9, r9, r4, LSL #24\n\t" + "ORR r8, r8, r5, LSL #24\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LSRS r8, r4, #7\n\t" + "LSRS r9, r5, #7\n\t" + "ORR r8, r8, r5, LSL #25\n\t" + "EOR r7, r7, r9\n\t" + "EOR r6, r6, r8\n\t" + "LDRD r4, r5, [sp, #120]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [sp, #120]\n\t" + "ADD r3, r3, #0x80\n\t" + "SUBS r12, r12, #0x1\n\t" + "BNE L_SHA512_transform_len_start_%=\n\t" + /* Round 0 */ + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r8, r9, [sp]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #56]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "STRD r8, r9, [%[sha512], #24]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "STRD r4, r5, [%[sha512], #56]\n\t" + "EOR r8, r8, 
r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #56]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 1 */ + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r8, r9, [sp, #8]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #8]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "STRD r8, r9, [%[sha512], #16]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #48]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 2 */ + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r8, r9, [sp, #16]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #16]\n\t" + "ADDS r4, r4, 
r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #40]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "STRD r8, r9, [%[sha512], #8]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "STRD r4, r5, [%[sha512], #40]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #40]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 3 */ + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r8, r9, [sp, #24]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #24]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "STRD r8, r9, [%[sha512]]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #32]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 4 */ + "LDRD r4, r5, [%[sha512]]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, 
r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r8, r9, [sp, #32]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #32]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "STRD r8, r9, [%[sha512], #56]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #24]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 5 */ + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r8, r9, [sp, #40]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #40]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "STRD r8, r9, [%[sha512], #48]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS 
r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #16]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 6 */ + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r8, r9, [sp, #48]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #48]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "STRD r8, r9, [%[sha512], #40]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #8]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 7 */ + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, 
r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "LDRD r8, r9, [sp, #56]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #56]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "STRD r8, r9, [%[sha512], #32]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512]]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 8 */ + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r8, r9, [sp, #64]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #64]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #56]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "STRD r8, r9, [%[sha512], #24]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "STRD r4, 
r5, [%[sha512], #56]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #56]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 9 */ + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r8, r9, [sp, #72]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #72]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "STRD r8, r9, [%[sha512], #16]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #48]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 10 */ + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r8, r9, [sp, #80]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + 
"LDRD r6, r7, [r3, #80]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #40]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "STRD r8, r9, [%[sha512], #8]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "STRD r4, r5, [%[sha512], #40]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #40]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 11 */ + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r8, r9, [sp, #88]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #88]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "STRD r8, r9, [%[sha512]]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #32]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 12 */ + "LDRD r4, r5, [%[sha512]]\n\t" + "LSRS r6, r4, 
#14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "LDRD r8, r9, [sp, #96]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #96]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "STRD r8, r9, [%[sha512], #56]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "STRD r4, r5, [%[sha512], #24]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #24]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 13 */ + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r4, r5, [%[sha512], #56]\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r8, r9, [sp, #104]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #104]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #48]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #24]\n\t" + "STRD r8, r9, [%[sha512], #48]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, 
r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #24]\n\t" + "LDRD r6, r7, [%[sha512], #32]\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #16]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 14 */ + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "LDRD r8, r9, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "LDRD r8, r9, [sp, #112]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #112]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #40]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "STRD r8, r9, [%[sha512], #40]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #16]\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "STRD r4, r5, [%[sha512], #8]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512], #8]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Round 15 */ + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LSRS r6, r4, #14\n\t" + "LSRS r7, r5, #14\n\t" + "ORR r7, r7, r4, LSL #18\n\t" + "ORR r6, r6, r5, LSL #18\n\t" + "LSRS r8, r4, #18\n\t" + "LSRS r9, r5, #18\n\t" + "ORR r9, r9, r4, LSL #14\n\t" + "ORR r8, r8, r5, LSL #14\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #23\n\t" + "LSLS r9, r5, #23\n\t" + "ORR r9, r9, r4, LSR #9\n\t" + "ORR r8, r8, r5, LSR #9\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + 
"EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "LDRD r4, r5, [%[sha512], #40]\n\t" + "LDRD r6, r7, [%[sha512], #48]\n\t" + "LDRD r8, r9, [%[sha512], #56]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "AND r6, r6, r4\n\t" + "AND r7, r7, r5\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "LDRD r8, r9, [sp, #120]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r6, r7, [r3, #120]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "LDRD r8, r9, [%[sha512], #32]\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "ADDS r8, r8, r4\n\t" + "ADC r9, r9, r5\n\t" + "LDRD r4, r5, [%[sha512], #8]\n\t" + "STRD r8, r9, [%[sha512], #32]\n\t" + "LSRS r6, r4, #28\n\t" + "LSRS r7, r5, #28\n\t" + "ORR r7, r7, r4, LSL #4\n\t" + "ORR r6, r6, r5, LSL #4\n\t" + "LSLS r8, r4, #30\n\t" + "LSLS r9, r5, #30\n\t" + "ORR r9, r9, r4, LSR #2\n\t" + "ORR r8, r8, r5, LSR #2\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "LSLS r8, r4, #25\n\t" + "LSLS r9, r5, #25\n\t" + "ORR r9, r9, r4, LSR #7\n\t" + "ORR r8, r8, r5, LSR #7\n\t" + "LDRD r4, r5, [%[sha512]]\n\t" + "EOR r6, r6, r8\n\t" + "EOR r7, r7, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADC r5, r5, r7\n\t" + "LDRD r8, r9, [%[sha512], #8]\n\t" + "LDRD r6, r7, [%[sha512], #16]\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "EOR r8, r8, r6\n\t" + "EOR r9, r9, r7\n\t" + "AND r10, r10, r8\n\t" + "AND r11, r11, r9\n\t" + "EOR r10, r10, r6\n\t" + "EOR r11, r11, r7\n\t" + "LDRD r6, r7, [%[sha512]]\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r6, r7, [%[sha512]]\n\t" + "MOV r10, r8\n\t" + "MOV r11, r9\n\t" + /* Add in digest from start */ + "LDRD r4, r5, [%[sha512]]\n\t" + "LDRD r6, r7, [%[sha512], #8]\n\t" + "LDRD r8, r9, [sp, #128]\n\t" + "LDRD r10, r11, [sp, #136]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r4, r5, [%[sha512]]\n\t" + "STRD r6, r7, [%[sha512], #8]\n\t" + "STRD r4, r5, [sp, #128]\n\t" + "STRD r6, r7, [sp, #136]\n\t" + "LDRD r4, r5, [%[sha512], #16]\n\t" + "LDRD r6, r7, [%[sha512], #24]\n\t" + "LDRD r8, r9, [sp, #144]\n\t" + "LDRD r10, r11, [sp, #152]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r4, r5, [%[sha512], #16]\n\t" + "STRD r6, r7, [%[sha512], #24]\n\t" + "STRD r4, r5, [sp, #144]\n\t" + "STRD r6, r7, [sp, #152]\n\t" + "LDRD r4, r5, [%[sha512], #32]\n\t" + "LDRD r6, r7, [%[sha512], #40]\n\t" + "LDRD r8, r9, [sp, #160]\n\t" + "LDRD r10, r11, [sp, #168]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r4, r5, [%[sha512], #32]\n\t" + "STRD r6, r7, [%[sha512], #40]\n\t" + "STRD r4, r5, [sp, #160]\n\t" + "STRD r6, r7, [sp, #168]\n\t" + "LDRD r4, r5, [%[sha512], #48]\n\t" + "LDRD r6, r7, [%[sha512], #56]\n\t" + "LDRD r8, r9, [sp, #176]\n\t" + "LDRD r10, r11, [sp, #184]\n\t" + "ADDS r4, r4, r8\n\t" + "ADC r5, r5, r9\n\t" + "ADDS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STRD r4, r5, [%[sha512], #48]\n\t" + "STRD r6, r7, [%[sha512], #56]\n\t" + "STRD r4, r5, [sp, #176]\n\t" + "STRD r6, r7, [sp, #184]\n\t" + "SUBS %[len], %[len], #0x80\n\t" + "SUB r3, r3, #0x200\n\t" + "ADD %[data], %[data], #0x80\n\t" + "BNE L_SHA512_transform_len_begin_%=\n\t" + "EOR r0, r0, r0\n\t" + "ADD sp, sp, #0xc0\n\t" + : [sha512] "+l" (sha512), [data] "+l" (data), [len] "+l" (len) + : [L_SHA512_transform_len_k] "r" 
(L_SHA512_transform_len_k) + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" + ); +} + +#endif /* WOLFSSL_ARMASM_NO_NEON */ +#endif /* WOLFSSL_SHA512 */ +#endif /* !__aarch64__ && __thumb__ */ +#endif /* WOLFSSL_ARMASM */ +#endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/sha256.c b/wolfcrypt/src/sha256.c index e9a6719b8b..e17ba65ac4 100644 --- a/wolfcrypt/src/sha256.c +++ b/wolfcrypt/src/sha256.c @@ -63,7 +63,8 @@ on the specific device platform. #endif -#if !defined(NO_SHA256) && !defined(WOLFSSL_ARMASM) +#if !defined(NO_SHA256) && (!defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_NEON)) #if defined(HAVE_FIPS) && defined(HAVE_FIPS_VERSION) && (HAVE_FIPS_VERSION >= 2) /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */ diff --git a/wolfcrypt/src/sha512.c b/wolfcrypt/src/sha512.c index ab774f504f..0c2750a902 100644 --- a/wolfcrypt/src/sha512.c +++ b/wolfcrypt/src/sha512.c @@ -26,7 +26,9 @@ #include -#if (defined(WOLFSSL_SHA512) || defined(WOLFSSL_SHA384)) && !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_PSOC6_CRYPTO) +#if (defined(WOLFSSL_SHA512) || defined(WOLFSSL_SHA384)) && \ + (!defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_NO_NEON)) && \ + !defined(WOLFSSL_PSOC6_CRYPTO) /* determine if we are using Espressif SHA hardware acceleration */ #undef WOLFSSL_USE_ESP32_CRYPT_HASH_HW diff --git a/wolfcrypt/src/sp_arm32.c b/wolfcrypt/src/sp_arm32.c index 3d6c0585e6..8a5bb7e6fa 100644 --- a/wolfcrypt/src/sp_arm32.c +++ b/wolfcrypt/src/sp_arm32.c @@ -76227,38 +76227,38 @@ static void sp_256_div2_mod_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) static const unsigned char L_sp_256_num_bits_8_table[] = { - 0x00, 0x01, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, - 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, - 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, - 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, - 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, - 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, - 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, - 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x00, 0x01, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, + 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, + 0x05, 0x05, 0x05, 0x05, 0x05, 
0x05, 0x05, 0x05, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, }; static int sp_256_num_bits_8(const sp_digit* a_p) @@ -93767,38 +93767,38 @@ static void sp_384_div2_mod_12(sp_digit* r_p, const sp_digit* a_p, const sp_digi #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) static const unsigned char L_sp_384_num_bits_12_table[] = { - 0x00, 0x01, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, - 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, - 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, - 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, - 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, - 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, - 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, - 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x00, 0x01, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, + 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 
0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, }; static int sp_384_num_bits_12(const sp_digit* a_p) @@ -121934,38 +121934,38 @@ static void sp_521_div2_mod_17(sp_digit* r_p, const sp_digit* a_p, const sp_digi #if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) static const unsigned char L_sp_521_num_bits_17_table[] = { - 0x00, 0x01, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, - 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, - 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, - 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, - 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, - 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, - 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, - 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x00, 0x01, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, + 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 
0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, }; static int sp_521_num_bits_17(const sp_digit* a_p) diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index 7735f2a7a6..3863ba5fa5 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -31081,8 +31081,10 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t ed25519_test(void) #if !defined(NO_ASN) && defined(HAVE_ED25519_SIGN) wc_ed25519_init_ex(&key3, HEAP_HINT, devId); #endif +#ifdef HAVE_ED25519_MAKE_KEY wc_ed25519_make_key(&rng, ED25519_KEY_SIZE, &key); wc_ed25519_make_key(&rng, ED25519_KEY_SIZE, &key2); +#endif /* helper functions for signature and key size */ keySz = wc_ed25519_size(&key); @@ -31251,7 +31253,7 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t ed25519_test(void) ret = ed25519_test_cert(); if (ret < 0) return ret; -#ifdef WOLFSSL_CERT_GEN +#if defined(WOLFSSL_CERT_GEN) && defined(HAVE_ED25519_MAKE_KEY) ret = ed25519_test_make_cert(); if (ret < 0) return ret; @@ -46473,7 +46475,7 @@ static int myCryptoDevCb(int devIdArg, wc_CryptoInfo* info, void* ctx) info->pk.curve25519.private_key->devId = devIdArg; } #endif /* HAVE_CURVE25519 */ - #ifdef HAVE_ED25519 + #if defined(HAVE_ED25519) && defined(HAVE_ED25519_MAKE_KEY) if (info->pk.type == WC_PK_TYPE_ED25519_KEYGEN) { /* set devId to invalid, so software is used */ info->pk.ed25519kg.key->devId = INVALID_DEVID; diff --git a/wolfssl/wolfcrypt/aes.h b/wolfssl/wolfcrypt/aes.h index 60fbdb4c4f..fb868f2379 100644 --- a/wolfssl/wolfcrypt/aes.h +++ b/wolfssl/wolfcrypt/aes.h @@ -56,7 +56,7 @@ typedef struct Gcm { } Gcm; WOLFSSL_LOCAL void GenerateM0(Gcm* gcm); -#ifdef WOLFSSL_ARMASM +#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_NO_NEON) WOLFSSL_LOCAL void GMULT(byte* X, byte* Y); #endif WOLFSSL_LOCAL void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, diff --git a/wolfssl/wolfcrypt/fe_operations.h b/wolfssl/wolfcrypt/fe_operations.h index 45daa003f9..cdd27db5dd 100644 --- a/wolfssl/wolfcrypt/fe_operations.h +++ b/wolfssl/wolfcrypt/fe_operations.h @@ -120,28 +120,6 @@ WOLFSSL_LOCAL word64 load_3(const unsigned char *in); WOLFSSL_LOCAL word64 load_4(const unsigned char *in); #ifdef CURVED25519_ASM -WOLFSSL_LOCAL void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, - const fe pz, const fe pt); -WOLFSSL_LOCAL void fe_ge_to_p3(fe rx, fe ry, fe rz, fe 
rt, const fe px,
-                               const fe py, const fe pz, const fe pt);
-WOLFSSL_LOCAL void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px,
-                             const fe py, const fe pz);
-WOLFSSL_LOCAL void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px,
-                              const fe py, const fe pz, const fe pt,
-                              const fe qxy2d, const fe qyplusx,
-                              const fe qyminusx);
-WOLFSSL_LOCAL void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px,
-                              const fe py, const fe pz, const fe pt,
-                              const fe qxy2d, const fe qyplusx,
-                              const fe qyminusx);
-WOLFSSL_LOCAL void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px,
-                             const fe py, const fe pz, const fe pt, const fe qz,
-                             const fe qt2d, const fe qyplusx,
-                             const fe qyminusx);
-WOLFSSL_LOCAL void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px,
-                             const fe py, const fe pz, const fe pt, const fe qz,
-                             const fe qt2d, const fe qyplusx,
-                             const fe qyminusx);
 WOLFSSL_LOCAL void fe_cmov_table(fe* r, fe* base, signed char b);
 #endif /* CURVED25519_ASM */
 #endif /* !CURVE25519_SMALL || !ED25519_SMALL */
diff --git a/wolfssl/wolfcrypt/ge_operations.h b/wolfssl/wolfcrypt/ge_operations.h
index 69bd60c7c8..1cf87e4626 100644
--- a/wolfssl/wolfcrypt/ge_operations.h
+++ b/wolfssl/wolfcrypt/ge_operations.h
@@ -82,7 +82,11 @@ WOLFSSL_LOCAL void sc_reduce(byte* s);
 WOLFSSL_LOCAL void sc_muladd(byte* s, const byte* a, const byte* b,
                              const byte* c);
 WOLFSSL_LOCAL void ge_tobytes(unsigned char *s,const ge_p2 *h);
+#ifndef GE_P3_TOBYTES_IMPL
+#define ge_p3_tobytes(s, h) ge_tobytes((s), (const ge_p2 *)(h))
+#else
 WOLFSSL_LOCAL void ge_p3_tobytes(unsigned char *s,const ge_p3 *h);
+#endif

 #ifndef ED25519_SMALL

@@ -105,9 +109,19 @@ typedef struct {
     ge Z;
     ge T2d;
 } ge_cached;
-
 #endif /* !ED25519_SMALL */

+#ifdef CURVED25519_ASM
+void ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p);
+void ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p);
+void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p);
+#define ge_p3_dbl(r, p) ge_p2_dbl((ge_p1p1 *)r, (ge_p2 *)p)
+void ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q);
+void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q);
+void ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q);
+void ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q);
+#endif
+
 #endif /* HAVE_ED25519 */

 #endif /* WOLF_CRYPT_GE_OPERATIONS_H */
diff --git a/wolfssl/wolfcrypt/settings.h b/wolfssl/wolfcrypt/settings.h
index f2fff6fe80..7e44d312c6 100644
--- a/wolfssl/wolfcrypt/settings.h
+++ b/wolfssl/wolfcrypt/settings.h
@@ -2229,8 +2229,15 @@ extern void uITRON4_free(void *p) ;

 /* Ed25519 Configs */
 #ifdef HAVE_ED25519
-    /* By default enable sign, verify, key export and import */
+    /* By default enable make key, sign, verify, key export and import */
+    #ifndef NO_ED25519_MAKE_KEY
+        #undef HAVE_ED25519_MAKE_KEY
+        #define HAVE_ED25519_MAKE_KEY
+    #endif
     #ifndef NO_ED25519_SIGN
+        #ifndef HAVE_ED25519_MAKE_KEY
+            #error "Need HAVE_ED25519_MAKE_KEY with HAVE_ED25519_SIGN"
+        #endif
         #undef HAVE_ED25519_SIGN
         #define HAVE_ED25519_SIGN
     #endif
diff --git a/wolfssl/wolfcrypt/sha512.h b/wolfssl/wolfcrypt/sha512.h
index 0d9c75bbbf..c4c42d7b75 100644
--- a/wolfssl/wolfcrypt/sha512.h
+++ b/wolfssl/wolfcrypt/sha512.h
@@ -198,6 +198,23 @@ struct wc_Sha512 {

 #ifdef WOLFSSL_SHA512

+#ifdef WOLFSSL_ARMASM
+#ifdef __aarch64__
+#ifndef WOLFSSL_ARMASM_CRYPTO_SHA512
+    void Transform_Sha512_Len_neon(wc_Sha512* sha512, const byte* data,
+        word32 len);
+    #define Transform_Sha512_Len Transform_Sha512_Len_neon
+#else
+    void Transform_Sha512_Len_crypto(wc_Sha512* sha512, const byte* data,
+        word32 len);
+    #define Transform_Sha512_Len Transform_Sha512_Len_crypto
+#endif
+#else
+extern void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data,
+    word32 len);
+#endif
+#endif
+
 WOLFSSL_API int wc_InitSha512(wc_Sha512* sha);
 WOLFSSL_API int wc_InitSha512_ex(wc_Sha512* sha, void* heap, int devId);
 WOLFSSL_API int wc_Sha512Update(wc_Sha512* sha, const byte* data, word32 len);