From e461bc72b8149e6ef45457c1f2319bea91d30ca7 Mon Sep 17 00:00:00 2001
From: Jacob Barthelmeh <jacob@wolfssl.com>
Date: Fri, 5 Jun 2015 15:39:37 -0600
Subject: [PATCH] curve25519 and ed25519 low memory

---
 configure.ac                      |  22 +-
 src/include.am                    |   8 +
 wolfcrypt/src/curve25519.c        |  68 +--
 wolfcrypt/src/ed25519.c           | 707 ++----------------------------
 wolfcrypt/src/fe_low_mem.c        | 596 +++++++++++++++++++++++++
 wolfcrypt/src/fe_operations.c     |  69 ++-
 wolfcrypt/src/ge_low_mem.c        | 558 +++++++++++++++++++++++
 wolfcrypt/src/ge_operations.c     | 680 +++++++++++++++++++++++++++-
 wolfcrypt/test/test.c             |  24 +-
 wolfssl/wolfcrypt/fe_operations.h |  91 +++-
 wolfssl/wolfcrypt/ge_operations.h |  27 +-
 11 files changed, 2060 insertions(+), 790 deletions(-)
 create mode 100644 wolfcrypt/src/fe_low_mem.c
 create mode 100644 wolfcrypt/src/ge_low_mem.c

diff --git a/configure.ac b/configure.ac
index f15b552c582..d2b073efcfd 100644
--- a/configure.ac
+++ b/configure.ac
@@ -680,6 +680,9 @@ then
 fi
 
 
+# for using memory optimization setting on both curve25519 and ed25519
+ENABLED_CURVED25519_SMALL=no
+
 # CURVE25519
 AC_ARG_ENABLE([curve25519],
     [AS_HELP_STRING([--enable-curve25519],[Enable Curve25519 (default: disabled)])],
@@ -687,10 +690,18 @@ AC_ARG_ENABLE([curve25519],
     [ ENABLED_CURVE25519=no ]
     )
 
+
+if test "$ENABLED_CURVE25519" = "small"
+then
+    AM_CFLAGS="$AM_CFLAGS -DCURVED25519_SMALL"
+    ENABLED_CURVED25519_SMALL=yes
+    ENABLED_CURVE25519=yes
+fi
+
 if test "$ENABLED_CURVE25519" = "yes"
 then
-    ENABLED_FEMATH=yes
     AM_CFLAGS="$AM_CFLAGS -DHAVE_CURVE25519"
+    ENABLED_FEMATH=yes
 fi
 
 
@@ -705,6 +716,13 @@ AC_ARG_ENABLE([ed25519],
     )
 
 
+if test "$ENABLED_ED25519" = "small"
+then
+    AM_CFLAGS="$AM_CFLAGS -DCURVED25519_SMALL"
+    ENABLED_CURVED25519_SMALL=yes
+    ENABLED_ED25519=yes
+fi
+
 if test "$ENABLED_ED25519" = "yes"
 then
     if test "$ENABLED_SHA512" = "no"
@@ -716,8 +734,8 @@ then
     AM_CFLAGS="$AM_CFLAGS -DHAVE_ED25519"
 fi
 
-
 AM_CONDITIONAL([BUILD_ED25519], [test "x$ENABLED_ED25519" = "xyes"])
+AM_CONDITIONAL([BUILD_CURVED25519_SMALL], [test "x$ENABLED_CURVED25519_SMALL" = "xyes"])
 AM_CONDITIONAL([BUILD_FEMATH],  [test "x$ENABLED_FEMATH" = "xyes"])
 AM_CONDITIONAL([BUILD_GEMATH],  [test "x$ENABLED_GEMATH" = "xyes"])
 
diff --git a/src/include.am b/src/include.am
index a89d7d4722f..8081e5e4c7a 100644
--- a/src/include.am
+++ b/src/include.am
@@ -176,12 +176,20 @@ src_libwolfssl_la_SOURCES += wolfcrypt/src/ed25519.c
 endif
 
 if BUILD_FEMATH
+if BUILD_CURVED25519_SMALL
+src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_low_mem.c
+else
 src_libwolfssl_la_SOURCES += wolfcrypt/src/fe_operations.c
 endif
+endif
 
 if BUILD_GEMATH
+if BUILD_CURVED25519_SMALL
+src_libwolfssl_la_SOURCES += wolfcrypt/src/ge_low_mem.c
+else
 src_libwolfssl_la_SOURCES += wolfcrypt/src/ge_operations.c
 endif
+endif
 
 if BUILD_LIBZ
 src_libwolfssl_la_SOURCES += wolfcrypt/src/compress.c
diff --git a/wolfcrypt/src/curve25519.c b/wolfcrypt/src/curve25519.c
index 74fb53c8366..b745a046cd2 100644
--- a/wolfcrypt/src/curve25519.c
+++ b/wolfcrypt/src/curve25519.c
@@ -46,72 +46,6 @@ const curve25519_set_type curve25519_sets[] = {
 };
 
 
-/* internal function */
-static int curve25519(unsigned char* q, unsigned char* n, unsigned char* p)
-{
-  unsigned char e[32];
-  unsigned int i;
-  fe x1;
-  fe x2;
-  fe z2;
-  fe x3;
-  fe z3;
-  fe tmp0;
-  fe tmp1;
-  int pos;
-  unsigned int swap;
-  unsigned int b;
-
-  for (i = 0;i < 32;++i) e[i] = n[i];
-  e[0] &= 248;
-  e[31] &= 127;
-  e[31] |= 64;
-
-  fe_frombytes(x1,p);
-  fe_1(x2);
-  fe_0(z2);
-  fe_copy(x3,x1);
-  fe_1(z3);
-
-  swap = 0;
-  for (pos = 254;pos >= 0;--pos) {
-    b = e[pos / 8] >> (pos & 7);
-    b &= 1;
-    swap ^= b;
-    fe_cswap(x2,x3,swap);
-    fe_cswap(z2,z3,swap);
-    swap = b;
-
-    /* montgomery */
-	fe_sub(tmp0,x3,z3);
-	fe_sub(tmp1,x2,z2);
-	fe_add(x2,x2,z2);
-	fe_add(z2,x3,z3);
-	fe_mul(z3,tmp0,x2);
-	fe_mul(z2,z2,tmp1);
-	fe_sq(tmp0,tmp1);
-	fe_sq(tmp1,x2);
-	fe_add(x3,z3,z2);
-	fe_sub(z2,z3,z2);
-	fe_mul(x2,tmp1,tmp0);
-	fe_sub(tmp1,tmp1,tmp0);
-	fe_sq(z2,z2);
-	fe_mul121666(z3,tmp1);
-	fe_sq(x3,x3);
-	fe_add(tmp0,tmp0,z3);
-	fe_mul(z3,x1,z2);
-	fe_mul(z2,tmp1,tmp0);
-  }
-  fe_cswap(x2,x3,swap);
-  fe_cswap(z2,z3,swap);
-
-  fe_invert(z2,z2);
-  fe_mul(x2,x2,z2);
-  fe_tobytes(q,x2);
-
-  return 0;
-}
-
 
 int wc_curve25519_make_key(RNG* rng, int keysize, curve25519_key* key)
 {
@@ -138,7 +72,7 @@ int wc_curve25519_make_key(RNG* rng, int keysize, curve25519_key* key)
   key->k.point[31] &= 127;
   key->k.point[31] |= 64;
 
-  /*compute public key*/
+  /* compute public key */
   ret = curve25519(p, key->k.point, basepoint);
 
   /* store keys in big endian format */
diff --git a/wolfcrypt/src/ed25519.c b/wolfcrypt/src/ed25519.c
index 54dc25b9728..ba0dcbe534e 100644
--- a/wolfcrypt/src/ed25519.c
+++ b/wolfcrypt/src/ed25519.c
@@ -39,636 +39,6 @@
 #endif
 
 
-void sc_reduce(byte* s);
-void sc_muladd(byte* s, const byte* a, const byte* b, const byte* c);
-
-/*
-Input:
-  s[0]+256*s[1]+...+256^63*s[63] = s
-
-Output:
-  s[0]+256*s[1]+...+256^31*s[31] = s mod l
-  where l = 2^252 + 27742317777372353535851937790883648493.
-  Overwrites s in place.
-*/
-
-void sc_reduce(byte* s)
-{
-  int64_t s0 = 2097151 & load_3(s);
-  int64_t s1 = 2097151 & (load_4(s + 2) >> 5);
-  int64_t s2 = 2097151 & (load_3(s + 5) >> 2);
-  int64_t s3 = 2097151 & (load_4(s + 7) >> 7);
-  int64_t s4 = 2097151 & (load_4(s + 10) >> 4);
-  int64_t s5 = 2097151 & (load_3(s + 13) >> 1);
-  int64_t s6 = 2097151 & (load_4(s + 15) >> 6);
-  int64_t s7 = 2097151 & (load_3(s + 18) >> 3);
-  int64_t s8 = 2097151 & load_3(s + 21);
-  int64_t s9 = 2097151 & (load_4(s + 23) >> 5);
-  int64_t s10 = 2097151 & (load_3(s + 26) >> 2);
-  int64_t s11 = 2097151 & (load_4(s + 28) >> 7);
-  int64_t s12 = 2097151 & (load_4(s + 31) >> 4);
-  int64_t s13 = 2097151 & (load_3(s + 34) >> 1);
-  int64_t s14 = 2097151 & (load_4(s + 36) >> 6);
-  int64_t s15 = 2097151 & (load_3(s + 39) >> 3);
-  int64_t s16 = 2097151 & load_3(s + 42);
-  int64_t s17 = 2097151 & (load_4(s + 44) >> 5);
-  int64_t s18 = 2097151 & (load_3(s + 47) >> 2);
-  int64_t s19 = 2097151 & (load_4(s + 49) >> 7);
-  int64_t s20 = 2097151 & (load_4(s + 52) >> 4);
-  int64_t s21 = 2097151 & (load_3(s + 55) >> 1);
-  int64_t s22 = 2097151 & (load_4(s + 57) >> 6);
-  int64_t s23 = (load_4(s + 60) >> 3);
-  int64_t carry0;
-  int64_t carry1;
-  int64_t carry2;
-  int64_t carry3;
-  int64_t carry4;
-  int64_t carry5;
-  int64_t carry6;
-  int64_t carry7;
-  int64_t carry8;
-  int64_t carry9;
-  int64_t carry10;
-  int64_t carry11;
-  int64_t carry12;
-  int64_t carry13;
-  int64_t carry14;
-  int64_t carry15;
-  int64_t carry16;
-
-  s11 += s23 * 666643;
-  s12 += s23 * 470296;
-  s13 += s23 * 654183;
-  s14 -= s23 * 997805;
-  s15 += s23 * 136657;
-  s16 -= s23 * 683901;
-  s23 = 0;
-
-  s10 += s22 * 666643;
-  s11 += s22 * 470296;
-  s12 += s22 * 654183;
-  s13 -= s22 * 997805;
-  s14 += s22 * 136657;
-  s15 -= s22 * 683901;
-  s22 = 0;
-
-  s9 += s21 * 666643;
-  s10 += s21 * 470296;
-  s11 += s21 * 654183;
-  s12 -= s21 * 997805;
-  s13 += s21 * 136657;
-  s14 -= s21 * 683901;
-  s21 = 0;
-
-  s8 += s20 * 666643;
-  s9 += s20 * 470296;
-  s10 += s20 * 654183;
-  s11 -= s20 * 997805;
-  s12 += s20 * 136657;
-  s13 -= s20 * 683901;
-  s20 = 0;
-
-  s7 += s19 * 666643;
-  s8 += s19 * 470296;
-  s9 += s19 * 654183;
-  s10 -= s19 * 997805;
-  s11 += s19 * 136657;
-  s12 -= s19 * 683901;
-  s19 = 0;
-
-  s6 += s18 * 666643;
-  s7 += s18 * 470296;
-  s8 += s18 * 654183;
-  s9 -= s18 * 997805;
-  s10 += s18 * 136657;
-  s11 -= s18 * 683901;
-  s18 = 0;
-
-  carry6 = (s6 + (1<<20)) >> 21; s7 += carry6; s6 -= carry6 << 21;
-  carry8 = (s8 + (1<<20)) >> 21; s9 += carry8; s8 -= carry8 << 21;
-  carry10 = (s10 + (1<<20)) >> 21; s11 += carry10; s10 -= carry10 << 21;
-  carry12 = (s12 + (1<<20)) >> 21; s13 += carry12; s12 -= carry12 << 21;
-  carry14 = (s14 + (1<<20)) >> 21; s15 += carry14; s14 -= carry14 << 21;
-  carry16 = (s16 + (1<<20)) >> 21; s17 += carry16; s16 -= carry16 << 21;
-
-  carry7 = (s7 + (1<<20)) >> 21; s8 += carry7; s7 -= carry7 << 21;
-  carry9 = (s9 + (1<<20)) >> 21; s10 += carry9; s9 -= carry9 << 21;
-  carry11 = (s11 + (1<<20)) >> 21; s12 += carry11; s11 -= carry11 << 21;
-  carry13 = (s13 + (1<<20)) >> 21; s14 += carry13; s13 -= carry13 << 21;
-  carry15 = (s15 + (1<<20)) >> 21; s16 += carry15; s15 -= carry15 << 21;
-
-  s5 += s17 * 666643;
-  s6 += s17 * 470296;
-  s7 += s17 * 654183;
-  s8 -= s17 * 997805;
-  s9 += s17 * 136657;
-  s10 -= s17 * 683901;
-  s17 = 0;
-
-  s4 += s16 * 666643;
-  s5 += s16 * 470296;
-  s6 += s16 * 654183;
-  s7 -= s16 * 997805;
-  s8 += s16 * 136657;
-  s9 -= s16 * 683901;
-  s16 = 0;
-
-  s3 += s15 * 666643;
-  s4 += s15 * 470296;
-  s5 += s15 * 654183;
-  s6 -= s15 * 997805;
-  s7 += s15 * 136657;
-  s8 -= s15 * 683901;
-  s15 = 0;
-
-  s2 += s14 * 666643;
-  s3 += s14 * 470296;
-  s4 += s14 * 654183;
-  s5 -= s14 * 997805;
-  s6 += s14 * 136657;
-  s7 -= s14 * 683901;
-  s14 = 0;
-
-  s1 += s13 * 666643;
-  s2 += s13 * 470296;
-  s3 += s13 * 654183;
-  s4 -= s13 * 997805;
-  s5 += s13 * 136657;
-  s6 -= s13 * 683901;
-  s13 = 0;
-
-  s0 += s12 * 666643;
-  s1 += s12 * 470296;
-  s2 += s12 * 654183;
-  s3 -= s12 * 997805;
-  s4 += s12 * 136657;
-  s5 -= s12 * 683901;
-  s12 = 0;
-
-  carry0 = (s0 + (1<<20)) >> 21; s1 += carry0; s0 -= carry0 << 21;
-  carry2 = (s2 + (1<<20)) >> 21; s3 += carry2; s2 -= carry2 << 21;
-  carry4 = (s4 + (1<<20)) >> 21; s5 += carry4; s4 -= carry4 << 21;
-  carry6 = (s6 + (1<<20)) >> 21; s7 += carry6; s6 -= carry6 << 21;
-  carry8 = (s8 + (1<<20)) >> 21; s9 += carry8; s8 -= carry8 << 21;
-  carry10 = (s10 + (1<<20)) >> 21; s11 += carry10; s10 -= carry10 << 21;
-
-  carry1 = (s1 + (1<<20)) >> 21; s2 += carry1; s1 -= carry1 << 21;
-  carry3 = (s3 + (1<<20)) >> 21; s4 += carry3; s3 -= carry3 << 21;
-  carry5 = (s5 + (1<<20)) >> 21; s6 += carry5; s5 -= carry5 << 21;
-  carry7 = (s7 + (1<<20)) >> 21; s8 += carry7; s7 -= carry7 << 21;
-  carry9 = (s9 + (1<<20)) >> 21; s10 += carry9; s9 -= carry9 << 21;
-  carry11 = (s11 + (1<<20)) >> 21; s12 += carry11; s11 -= carry11 << 21;
-
-  s0 += s12 * 666643;
-  s1 += s12 * 470296;
-  s2 += s12 * 654183;
-  s3 -= s12 * 997805;
-  s4 += s12 * 136657;
-  s5 -= s12 * 683901;
-  s12 = 0;
-
-  carry0 = s0 >> 21; s1 += carry0; s0 -= carry0 << 21;
-  carry1 = s1 >> 21; s2 += carry1; s1 -= carry1 << 21;
-  carry2 = s2 >> 21; s3 += carry2; s2 -= carry2 << 21;
-  carry3 = s3 >> 21; s4 += carry3; s3 -= carry3 << 21;
-  carry4 = s4 >> 21; s5 += carry4; s4 -= carry4 << 21;
-  carry5 = s5 >> 21; s6 += carry5; s5 -= carry5 << 21;
-  carry6 = s6 >> 21; s7 += carry6; s6 -= carry6 << 21;
-  carry7 = s7 >> 21; s8 += carry7; s7 -= carry7 << 21;
-  carry8 = s8 >> 21; s9 += carry8; s8 -= carry8 << 21;
-  carry9 = s9 >> 21; s10 += carry9; s9 -= carry9 << 21;
-  carry10 = s10 >> 21; s11 += carry10; s10 -= carry10 << 21;
-  carry11 = s11 >> 21; s12 += carry11; s11 -= carry11 << 21;
-
-  s0 += s12 * 666643;
-  s1 += s12 * 470296;
-  s2 += s12 * 654183;
-  s3 -= s12 * 997805;
-  s4 += s12 * 136657;
-  s5 -= s12 * 683901;
-  s12 = 0;
-
-  carry0 = s0 >> 21; s1 += carry0; s0 -= carry0 << 21;
-  carry1 = s1 >> 21; s2 += carry1; s1 -= carry1 << 21;
-  carry2 = s2 >> 21; s3 += carry2; s2 -= carry2 << 21;
-  carry3 = s3 >> 21; s4 += carry3; s3 -= carry3 << 21;
-  carry4 = s4 >> 21; s5 += carry4; s4 -= carry4 << 21;
-  carry5 = s5 >> 21; s6 += carry5; s5 -= carry5 << 21;
-  carry6 = s6 >> 21; s7 += carry6; s6 -= carry6 << 21;
-  carry7 = s7 >> 21; s8 += carry7; s7 -= carry7 << 21;
-  carry8 = s8 >> 21; s9 += carry8; s8 -= carry8 << 21;
-  carry9 = s9 >> 21; s10 += carry9; s9 -= carry9 << 21;
-  carry10 = s10 >> 21; s11 += carry10; s10 -= carry10 << 21;
-
-  s[0] = s0 >> 0;
-  s[1] = s0 >> 8;
-  s[2] = (s0 >> 16) | (s1 << 5);
-  s[3] = s1 >> 3;
-  s[4] = s1 >> 11;
-  s[5] = (s1 >> 19) | (s2 << 2);
-  s[6] = s2 >> 6;
-  s[7] = (s2 >> 14) | (s3 << 7);
-  s[8] = s3 >> 1;
-  s[9] = s3 >> 9;
-  s[10] = (s3 >> 17) | (s4 << 4);
-  s[11] = s4 >> 4;
-  s[12] = s4 >> 12;
-  s[13] = (s4 >> 20) | (s5 << 1);
-  s[14] = s5 >> 7;
-  s[15] = (s5 >> 15) | (s6 << 6);
-  s[16] = s6 >> 2;
-  s[17] = s6 >> 10;
-  s[18] = (s6 >> 18) | (s7 << 3);
-  s[19] = s7 >> 5;
-  s[20] = s7 >> 13;
-  s[21] = s8 >> 0;
-  s[22] = s8 >> 8;
-  s[23] = (s8 >> 16) | (s9 << 5);
-  s[24] = s9 >> 3;
-  s[25] = s9 >> 11;
-  s[26] = (s9 >> 19) | (s10 << 2);
-  s[27] = s10 >> 6;
-  s[28] = (s10 >> 14) | (s11 << 7);
-  s[29] = s11 >> 1;
-  s[30] = s11 >> 9;
-  s[31] = s11 >> 17;
-
-  /* hush warnings after setting values to 0 */
-  (void)s12;
-  (void)s13;
-  (void)s14;
-  (void)s15;
-  (void)s16;
-  (void)s17;
-  (void)s18;
-  (void)s19;
-  (void)s20;
-  (void)s21;
-  (void)s22;
-  (void)s23;
-}
-
-
-/*
-Input:
-  a[0]+256*a[1]+...+256^31*a[31] = a
-  b[0]+256*b[1]+...+256^31*b[31] = b
-  c[0]+256*c[1]+...+256^31*c[31] = c
-
-Output:
-  s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l
-  where l = 2^252 + 27742317777372353535851937790883648493.
-*/
-
-void sc_muladd(byte* s, const byte* a, const byte* b, const byte* c)
-{
-  int64_t a0 = 2097151 & load_3(a);
-  int64_t a1 = 2097151 & (load_4(a + 2) >> 5);
-  int64_t a2 = 2097151 & (load_3(a + 5) >> 2);
-  int64_t a3 = 2097151 & (load_4(a + 7) >> 7);
-  int64_t a4 = 2097151 & (load_4(a + 10) >> 4);
-  int64_t a5 = 2097151 & (load_3(a + 13) >> 1);
-  int64_t a6 = 2097151 & (load_4(a + 15) >> 6);
-  int64_t a7 = 2097151 & (load_3(a + 18) >> 3);
-  int64_t a8 = 2097151 & load_3(a + 21);
-  int64_t a9 = 2097151 & (load_4(a + 23) >> 5);
-  int64_t a10 = 2097151 & (load_3(a + 26) >> 2);
-  int64_t a11 = (load_4(a + 28) >> 7);
-  int64_t b0 = 2097151 & load_3(b);
-  int64_t b1 = 2097151 & (load_4(b + 2) >> 5);
-  int64_t b2 = 2097151 & (load_3(b + 5) >> 2);
-  int64_t b3 = 2097151 & (load_4(b + 7) >> 7);
-  int64_t b4 = 2097151 & (load_4(b + 10) >> 4);
-  int64_t b5 = 2097151 & (load_3(b + 13) >> 1);
-  int64_t b6 = 2097151 & (load_4(b + 15) >> 6);
-  int64_t b7 = 2097151 & (load_3(b + 18) >> 3);
-  int64_t b8 = 2097151 & load_3(b + 21);
-  int64_t b9 = 2097151 & (load_4(b + 23) >> 5);
-  int64_t b10 = 2097151 & (load_3(b + 26) >> 2);
-  int64_t b11 = (load_4(b + 28) >> 7);
-  int64_t c0 = 2097151 & load_3(c);
-  int64_t c1 = 2097151 & (load_4(c + 2) >> 5);
-  int64_t c2 = 2097151 & (load_3(c + 5) >> 2);
-  int64_t c3 = 2097151 & (load_4(c + 7) >> 7);
-  int64_t c4 = 2097151 & (load_4(c + 10) >> 4);
-  int64_t c5 = 2097151 & (load_3(c + 13) >> 1);
-  int64_t c6 = 2097151 & (load_4(c + 15) >> 6);
-  int64_t c7 = 2097151 & (load_3(c + 18) >> 3);
-  int64_t c8 = 2097151 & load_3(c + 21);
-  int64_t c9 = 2097151 & (load_4(c + 23) >> 5);
-  int64_t c10 = 2097151 & (load_3(c + 26) >> 2);
-  int64_t c11 = (load_4(c + 28) >> 7);
-  int64_t s0;
-  int64_t s1;
-  int64_t s2;
-  int64_t s3;
-  int64_t s4;
-  int64_t s5;
-  int64_t s6;
-  int64_t s7;
-  int64_t s8;
-  int64_t s9;
-  int64_t s10;
-  int64_t s11;
-  int64_t s12;
-  int64_t s13;
-  int64_t s14;
-  int64_t s15;
-  int64_t s16;
-  int64_t s17;
-  int64_t s18;
-  int64_t s19;
-  int64_t s20;
-  int64_t s21;
-  int64_t s22;
-  int64_t s23;
-  int64_t carry0;
-  int64_t carry1;
-  int64_t carry2;
-  int64_t carry3;
-  int64_t carry4;
-  int64_t carry5;
-  int64_t carry6;
-  int64_t carry7;
-  int64_t carry8;
-  int64_t carry9;
-  int64_t carry10;
-  int64_t carry11;
-  int64_t carry12;
-  int64_t carry13;
-  int64_t carry14;
-  int64_t carry15;
-  int64_t carry16;
-  int64_t carry17;
-  int64_t carry18;
-  int64_t carry19;
-  int64_t carry20;
-  int64_t carry21;
-  int64_t carry22;
-
-  s0 = c0 + a0*b0;
-  s1 = c1 + a0*b1 + a1*b0;
-  s2 = c2 + a0*b2 + a1*b1 + a2*b0;
-  s3 = c3 + a0*b3 + a1*b2 + a2*b1 + a3*b0;
-  s4 = c4 + a0*b4 + a1*b3 + a2*b2 + a3*b1 + a4*b0;
-  s5 = c5 + a0*b5 + a1*b4 + a2*b3 + a3*b2 + a4*b1 + a5*b0;
-  s6 = c6 + a0*b6 + a1*b5 + a2*b4 + a3*b3 + a4*b2 + a5*b1 + a6*b0;
-  s7 = c7 + a0*b7 + a1*b6 + a2*b5 + a3*b4 + a4*b3 + a5*b2 + a6*b1 + a7*b0;
-  s8 = c8 + a0*b8 + a1*b7 + a2*b6 + a3*b5 + a4*b4 + a5*b3 + a6*b2 + a7*b1 + a8*b0;
-  s9 = c9 + a0*b9 + a1*b8 + a2*b7 + a3*b6 + a4*b5 + a5*b4 + a6*b3 + a7*b2 + a8*b1 + a9*b0;
-  s10 = c10 + a0*b10 + a1*b9 + a2*b8 + a3*b7 + a4*b6 + a5*b5 + a6*b4 + a7*b3 + a8*b2 + a9*b1 + a10*b0;
-  s11 = c11 + a0*b11 + a1*b10 + a2*b9 + a3*b8 + a4*b7 + a5*b6 + a6*b5 + a7*b4 + a8*b3 + a9*b2 + a10*b1 + a11*b0;
-  s12 = a1*b11 + a2*b10 + a3*b9 + a4*b8 + a5*b7 + a6*b6 + a7*b5 + a8*b4 + a9*b3 + a10*b2 + a11*b1;
-  s13 = a2*b11 + a3*b10 + a4*b9 + a5*b8 + a6*b7 + a7*b6 + a8*b5 + a9*b4 + a10*b3 + a11*b2;
-  s14 = a3*b11 + a4*b10 + a5*b9 + a6*b8 + a7*b7 + a8*b6 + a9*b5 + a10*b4 + a11*b3;
-  s15 = a4*b11 + a5*b10 + a6*b9 + a7*b8 + a8*b7 + a9*b6 + a10*b5 + a11*b4;
-  s16 = a5*b11 + a6*b10 + a7*b9 + a8*b8 + a9*b7 + a10*b6 + a11*b5;
-  s17 = a6*b11 + a7*b10 + a8*b9 + a9*b8 + a10*b7 + a11*b6;
-  s18 = a7*b11 + a8*b10 + a9*b9 + a10*b8 + a11*b7;
-  s19 = a8*b11 + a9*b10 + a10*b9 + a11*b8;
-  s20 = a9*b11 + a10*b10 + a11*b9;
-  s21 = a10*b11 + a11*b10;
-  s22 = a11*b11;
-  s23 = 0;
-
-  carry0 = (s0 + (1<<20)) >> 21; s1 += carry0; s0 -= carry0 << 21;
-  carry2 = (s2 + (1<<20)) >> 21; s3 += carry2; s2 -= carry2 << 21;
-  carry4 = (s4 + (1<<20)) >> 21; s5 += carry4; s4 -= carry4 << 21;
-  carry6 = (s6 + (1<<20)) >> 21; s7 += carry6; s6 -= carry6 << 21;
-  carry8 = (s8 + (1<<20)) >> 21; s9 += carry8; s8 -= carry8 << 21;
-  carry10 = (s10 + (1<<20)) >> 21; s11 += carry10; s10 -= carry10 << 21;
-  carry12 = (s12 + (1<<20)) >> 21; s13 += carry12; s12 -= carry12 << 21;
-  carry14 = (s14 + (1<<20)) >> 21; s15 += carry14; s14 -= carry14 << 21;
-  carry16 = (s16 + (1<<20)) >> 21; s17 += carry16; s16 -= carry16 << 21;
-  carry18 = (s18 + (1<<20)) >> 21; s19 += carry18; s18 -= carry18 << 21;
-  carry20 = (s20 + (1<<20)) >> 21; s21 += carry20; s20 -= carry20 << 21;
-  carry22 = (s22 + (1<<20)) >> 21; s23 += carry22; s22 -= carry22 << 21;
-
-  carry1 = (s1 + (1<<20)) >> 21; s2 += carry1; s1 -= carry1 << 21;
-  carry3 = (s3 + (1<<20)) >> 21; s4 += carry3; s3 -= carry3 << 21;
-  carry5 = (s5 + (1<<20)) >> 21; s6 += carry5; s5 -= carry5 << 21;
-  carry7 = (s7 + (1<<20)) >> 21; s8 += carry7; s7 -= carry7 << 21;
-  carry9 = (s9 + (1<<20)) >> 21; s10 += carry9; s9 -= carry9 << 21;
-  carry11 = (s11 + (1<<20)) >> 21; s12 += carry11; s11 -= carry11 << 21;
-  carry13 = (s13 + (1<<20)) >> 21; s14 += carry13; s13 -= carry13 << 21;
-  carry15 = (s15 + (1<<20)) >> 21; s16 += carry15; s15 -= carry15 << 21;
-  carry17 = (s17 + (1<<20)) >> 21; s18 += carry17; s17 -= carry17 << 21;
-  carry19 = (s19 + (1<<20)) >> 21; s20 += carry19; s19 -= carry19 << 21;
-  carry21 = (s21 + (1<<20)) >> 21; s22 += carry21; s21 -= carry21 << 21;
-
-  s11 += s23 * 666643;
-  s12 += s23 * 470296;
-  s13 += s23 * 654183;
-  s14 -= s23 * 997805;
-  s15 += s23 * 136657;
-  s16 -= s23 * 683901;
-  s23 = 0;
-
-  s10 += s22 * 666643;
-  s11 += s22 * 470296;
-  s12 += s22 * 654183;
-  s13 -= s22 * 997805;
-  s14 += s22 * 136657;
-  s15 -= s22 * 683901;
-  s22 = 0;
-
-  s9 += s21 * 666643;
-  s10 += s21 * 470296;
-  s11 += s21 * 654183;
-  s12 -= s21 * 997805;
-  s13 += s21 * 136657;
-  s14 -= s21 * 683901;
-  s21 = 0;
-
-  s8 += s20 * 666643;
-  s9 += s20 * 470296;
-  s10 += s20 * 654183;
-  s11 -= s20 * 997805;
-  s12 += s20 * 136657;
-  s13 -= s20 * 683901;
-  s20 = 0;
-
-  s7 += s19 * 666643;
-  s8 += s19 * 470296;
-  s9 += s19 * 654183;
-  s10 -= s19 * 997805;
-  s11 += s19 * 136657;
-  s12 -= s19 * 683901;
-  s19 = 0;
-
-  s6 += s18 * 666643;
-  s7 += s18 * 470296;
-  s8 += s18 * 654183;
-  s9 -= s18 * 997805;
-  s10 += s18 * 136657;
-  s11 -= s18 * 683901;
-  s18 = 0;
-
-  carry6 = (s6 + (1<<20)) >> 21; s7 += carry6; s6 -= carry6 << 21;
-  carry8 = (s8 + (1<<20)) >> 21; s9 += carry8; s8 -= carry8 << 21;
-  carry10 = (s10 + (1<<20)) >> 21; s11 += carry10; s10 -= carry10 << 21;
-  carry12 = (s12 + (1<<20)) >> 21; s13 += carry12; s12 -= carry12 << 21;
-  carry14 = (s14 + (1<<20)) >> 21; s15 += carry14; s14 -= carry14 << 21;
-  carry16 = (s16 + (1<<20)) >> 21; s17 += carry16; s16 -= carry16 << 21;
-
-  carry7 = (s7 + (1<<20)) >> 21; s8 += carry7; s7 -= carry7 << 21;
-  carry9 = (s9 + (1<<20)) >> 21; s10 += carry9; s9 -= carry9 << 21;
-  carry11 = (s11 + (1<<20)) >> 21; s12 += carry11; s11 -= carry11 << 21;
-  carry13 = (s13 + (1<<20)) >> 21; s14 += carry13; s13 -= carry13 << 21;
-  carry15 = (s15 + (1<<20)) >> 21; s16 += carry15; s15 -= carry15 << 21;
-
-  s5 += s17 * 666643;
-  s6 += s17 * 470296;
-  s7 += s17 * 654183;
-  s8 -= s17 * 997805;
-  s9 += s17 * 136657;
-  s10 -= s17 * 683901;
-  s17 = 0;
-
-  s4 += s16 * 666643;
-  s5 += s16 * 470296;
-  s6 += s16 * 654183;
-  s7 -= s16 * 997805;
-  s8 += s16 * 136657;
-  s9 -= s16 * 683901;
-  s16 = 0;
-
-  s3 += s15 * 666643;
-  s4 += s15 * 470296;
-  s5 += s15 * 654183;
-  s6 -= s15 * 997805;
-  s7 += s15 * 136657;
-  s8 -= s15 * 683901;
-  s15 = 0;
-
-  s2 += s14 * 666643;
-  s3 += s14 * 470296;
-  s4 += s14 * 654183;
-  s5 -= s14 * 997805;
-  s6 += s14 * 136657;
-  s7 -= s14 * 683901;
-  s14 = 0;
-
-  s1 += s13 * 666643;
-  s2 += s13 * 470296;
-  s3 += s13 * 654183;
-  s4 -= s13 * 997805;
-  s5 += s13 * 136657;
-  s6 -= s13 * 683901;
-  s13 = 0;
-
-  s0 += s12 * 666643;
-  s1 += s12 * 470296;
-  s2 += s12 * 654183;
-  s3 -= s12 * 997805;
-  s4 += s12 * 136657;
-  s5 -= s12 * 683901;
-  s12 = 0;
-
-  carry0 = (s0 + (1<<20)) >> 21; s1 += carry0; s0 -= carry0 << 21;
-  carry2 = (s2 + (1<<20)) >> 21; s3 += carry2; s2 -= carry2 << 21;
-  carry4 = (s4 + (1<<20)) >> 21; s5 += carry4; s4 -= carry4 << 21;
-  carry6 = (s6 + (1<<20)) >> 21; s7 += carry6; s6 -= carry6 << 21;
-  carry8 = (s8 + (1<<20)) >> 21; s9 += carry8; s8 -= carry8 << 21;
-  carry10 = (s10 + (1<<20)) >> 21; s11 += carry10; s10 -= carry10 << 21;
-
-  carry1 = (s1 + (1<<20)) >> 21; s2 += carry1; s1 -= carry1 << 21;
-  carry3 = (s3 + (1<<20)) >> 21; s4 += carry3; s3 -= carry3 << 21;
-  carry5 = (s5 + (1<<20)) >> 21; s6 += carry5; s5 -= carry5 << 21;
-  carry7 = (s7 + (1<<20)) >> 21; s8 += carry7; s7 -= carry7 << 21;
-  carry9 = (s9 + (1<<20)) >> 21; s10 += carry9; s9 -= carry9 << 21;
-  carry11 = (s11 + (1<<20)) >> 21; s12 += carry11; s11 -= carry11 << 21;
-
-  s0 += s12 * 666643;
-  s1 += s12 * 470296;
-  s2 += s12 * 654183;
-  s3 -= s12 * 997805;
-  s4 += s12 * 136657;
-  s5 -= s12 * 683901;
-  s12 = 0;
-
-  carry0 = s0 >> 21; s1 += carry0; s0 -= carry0 << 21;
-  carry1 = s1 >> 21; s2 += carry1; s1 -= carry1 << 21;
-  carry2 = s2 >> 21; s3 += carry2; s2 -= carry2 << 21;
-  carry3 = s3 >> 21; s4 += carry3; s3 -= carry3 << 21;
-  carry4 = s4 >> 21; s5 += carry4; s4 -= carry4 << 21;
-  carry5 = s5 >> 21; s6 += carry5; s5 -= carry5 << 21;
-  carry6 = s6 >> 21; s7 += carry6; s6 -= carry6 << 21;
-  carry7 = s7 >> 21; s8 += carry7; s7 -= carry7 << 21;
-  carry8 = s8 >> 21; s9 += carry8; s8 -= carry8 << 21;
-  carry9 = s9 >> 21; s10 += carry9; s9 -= carry9 << 21;
-  carry10 = s10 >> 21; s11 += carry10; s10 -= carry10 << 21;
-  carry11 = s11 >> 21; s12 += carry11; s11 -= carry11 << 21;
-
-  s0 += s12 * 666643;
-  s1 += s12 * 470296;
-  s2 += s12 * 654183;
-  s3 -= s12 * 997805;
-  s4 += s12 * 136657;
-  s5 -= s12 * 683901;
-  s12 = 0;
-
-  carry0 = s0 >> 21; s1 += carry0; s0 -= carry0 << 21;
-  carry1 = s1 >> 21; s2 += carry1; s1 -= carry1 << 21;
-  carry2 = s2 >> 21; s3 += carry2; s2 -= carry2 << 21;
-  carry3 = s3 >> 21; s4 += carry3; s3 -= carry3 << 21;
-  carry4 = s4 >> 21; s5 += carry4; s4 -= carry4 << 21;
-  carry5 = s5 >> 21; s6 += carry5; s5 -= carry5 << 21;
-  carry6 = s6 >> 21; s7 += carry6; s6 -= carry6 << 21;
-  carry7 = s7 >> 21; s8 += carry7; s7 -= carry7 << 21;
-  carry8 = s8 >> 21; s9 += carry8; s8 -= carry8 << 21;
-  carry9 = s9 >> 21; s10 += carry9; s9 -= carry9 << 21;
-  carry10 = s10 >> 21; s11 += carry10; s10 -= carry10 << 21;
-
-  s[0] = s0 >> 0;
-  s[1] = s0 >> 8;
-  s[2] = (s0 >> 16) | (s1 << 5);
-  s[3] = s1 >> 3;
-  s[4] = s1 >> 11;
-  s[5] = (s1 >> 19) | (s2 << 2);
-  s[6] = s2 >> 6;
-  s[7] = (s2 >> 14) | (s3 << 7);
-  s[8] = s3 >> 1;
-  s[9] = s3 >> 9;
-  s[10] = (s3 >> 17) | (s4 << 4);
-  s[11] = s4 >> 4;
-  s[12] = s4 >> 12;
-  s[13] = (s4 >> 20) | (s5 << 1);
-  s[14] = s5 >> 7;
-  s[15] = (s5 >> 15) | (s6 << 6);
-  s[16] = s6 >> 2;
-  s[17] = s6 >> 10;
-  s[18] = (s6 >> 18) | (s7 << 3);
-  s[19] = s7 >> 5;
-  s[20] = s7 >> 13;
-  s[21] = s8 >> 0;
-  s[22] = s8 >> 8;
-  s[23] = (s8 >> 16) | (s9 << 5);
-  s[24] = s9 >> 3;
-  s[25] = s9 >> 11;
-  s[26] = (s9 >> 19) | (s10 << 2);
-  s[27] = s10 >> 6;
-  s[28] = (s10 >> 14) | (s11 << 7);
-  s[29] = s11 >> 1;
-  s[30] = s11 >> 9;
-  s[31] = s11 >> 17;
-
-  /* hush warnings after setting values to 0 */
-  (void)s12;
-  (void)s13;
-  (void)s14;
-  (void)s15;
-  (void)s16;
-  (void)s17;
-  (void)s18;
-  (void)s19;
-  (void)s20;
-  (void)s21;
-  (void)s22;
-  (void)s23;
-}
-
-
 /*
     generate an ed25519 key pair.
     returns 0 on success
@@ -676,8 +46,8 @@ void sc_muladd(byte* s, const byte* a, const byte* b, const byte* c)
 int wc_ed25519_make_key(RNG* rng, int keySz, ed25519_key* key)
 {
     byte  az[64];
-    ge_p3 A;
     int   ret;
+    ge_p3 A;
 
     if (rng == NULL || key == NULL)
         return BAD_FUNC_ARG;
@@ -713,13 +83,13 @@ int wc_ed25519_make_key(RNG* rng, int keySz, ed25519_key* key)
 int wc_ed25519_sign_msg(const byte* in, word32 inlen, byte* out,
                         word32 *outlen, ed25519_key* key)
 {
-    int    ret = 0;
-    byte   nonce[64];
-    byte   hram[64];
+    ge_p3  R;
+    byte   nonce[SHA512_DIGEST_SIZE];
+    byte   hram[SHA512_DIGEST_SIZE];
     byte   az[64];
     word32 sigSz;
-    ge_p3  R;
     Sha512 sha;
+    int    ret = 0;
 
     /* sanity check on arguments */
     if (in == NULL || out == NULL || outlen == NULL || key == NULL)
@@ -732,7 +102,8 @@ int wc_ed25519_sign_msg(const byte* in, word32 inlen, byte* out,
         return BAD_FUNC_ARG;
     *outlen = sigSz;
 
-    /* create nonce to use */
+    /* step 1: create nonce to use where nonce is r in
+       r = H(h_b, ... ,h_2b-1,M) */
     ret |= wc_Sha512Hash(key->k,32,az);
     az[0]  &= 248;
     az[31] &= 63;
@@ -742,10 +113,14 @@ int wc_ed25519_sign_msg(const byte* in, word32 inlen, byte* out,
     ret |= wc_Sha512Update(&sha, in, inlen);
     ret |= wc_Sha512Final(&sha, nonce);
     sc_reduce(nonce);
+
+    /* step 2: computing R = rB where rB is the scalar multiplication of
+       r and B */
     ge_scalarmult_base(&R,nonce);
     ge_p3_tobytes(out,&R);
 
-    /* hash scalarmult of nonce + public key + message */
+    /* step 3: hash R + public key + message getting H(R,A,M) then
+       creating S = (r + H(R,A,M)a) mod l */
     ret |= wc_InitSha512(&sha);
     ret |= wc_Sha512Update(&sha, out, 32);
     ret |= wc_Sha512Update(&sha, key->p, 32);
@@ -768,13 +143,13 @@ int wc_ed25519_sign_msg(const byte* in, word32 inlen, byte* out,
 int wc_ed25519_verify_msg(byte* sig, word32 siglen, const byte* msg,
                           word32 msglen, int* stat, ed25519_key* key)
 {
-    int    ret;
-    word32 sigSz;
-    byte   h[64];
     byte   rcheck[32];
-    Sha512 sha;
+    byte   h[SHA512_DIGEST_SIZE];
     ge_p3  A;
     ge_p2  R;
+    word32 sigSz;
+    int    ret;
+    Sha512 sha;
 
     /* sanity check on arguments */
     if (sig == NULL || msg == NULL || stat == NULL || key == NULL)
@@ -789,10 +164,12 @@ int wc_ed25519_verify_msg(byte* sig, word32 siglen, const byte* msg,
         return BAD_FUNC_ARG;
     if (sig[63] & 224)
         return BAD_FUNC_ARG;
+
+    /* uncompress A (public key), test if valid, and negate it */
     if (ge_frombytes_negate_vartime(&A, key->p) != 0)
         return BAD_FUNC_ARG;
 
-    /* reduce hash of r + public key + message */
+    /* find H(R,A,M) and store it as h */
     ret |= wc_InitSha512(&sha);
     ret |= wc_Sha512Update(&sha, sig,    32);
     ret |= wc_Sha512Update(&sha, key->p, 32);
@@ -800,12 +177,16 @@ int wc_ed25519_verify_msg(byte* sig, word32 siglen, const byte* msg,
     ret |= wc_Sha512Final(&sha,  h);
     sc_reduce(h);
 
-    /* scalarmult placed in R using hash + A + s */
-    ge_double_scalarmult_vartime(&R, h, &A, sig + 32);
-    ge_tobytes(rcheck,&R);
+    /*
+       Uses a fast single-signature verification SB = R + H(R,A,M)A becomes
+       SB - H(R,A,M)A saving decompression of R
+    */
+    ret |= ge_double_scalarmult_vartime(&R, h, &A, sig + 32);
+    ge_tobytes(rcheck, &R);
 
-    /* comparison of r created to r in sig */
+    /* comparison of R created to R in sig */
     ret  |= ConstantCompare(rcheck, sig, 32);
+
     *stat = (ret == 0)? 1: 0;
 
     return ret;
@@ -843,6 +224,7 @@ int wc_ed25519_export_public(ed25519_key* key, byte* out, word32* outLen)
 {
     word32 keySz;
 
+    /* sanity check on arguments */
     if (key == NULL || out == NULL || outLen == NULL)
         return BAD_FUNC_ARG;
 
@@ -858,35 +240,6 @@ int wc_ed25519_export_public(ed25519_key* key, byte* out, word32* outLen)
 }
 
 
-/* internal function for importing uncompressed public keys */
-static int compress_key(byte* out, const byte* xIn, const byte* yIn,
-                        word32 keySz)
-{
-    fe     x,y,z;
-    ge_p3  g;
-    byte   bArray[ED25519_KEY_SIZE];
-    word32 i;
-
-    fe_0(x);
-    fe_0(y);
-    fe_1(z);
-    fe_frombytes(x, xIn);
-    fe_frombytes(y, yIn);
-
-    fe_copy(g.X, x);
-    fe_copy(g.Y, y);
-    fe_copy(g.Z, z);
-
-    ge_p3_tobytes(bArray, &g);
-
-    for (i = 0; i < keySz; i++) {
-        out[keySz - 1 - i] = bArray[i];
-    }
-
-    return 0;
-}
-
-
 /*
     Imports a compressed/uncompressed public key.
     in    the byte array containing the public key
@@ -898,6 +251,7 @@ int wc_ed25519_import_public(const byte* in, word32 inLen, ed25519_key* key)
     word32 keySz;
     int    ret;
 
+    /* sanity check on arguments */
     if (in == NULL || key == NULL)
         return BAD_FUNC_ARG;
 
@@ -917,7 +271,7 @@ int wc_ed25519_import_public(const byte* in, word32 inLen, ed25519_key* key)
     /* importing uncompressed public key */
     if (in[0] == 0x04) {
         /* pass in (x,y) and store compressed key */
-        ret = compress_key(key->p, (in+1), (in+1+keySz), keySz);
+        ret = ge_compress_key(key->p, (in+1), (in+1+keySz), keySz);
         return ret;
     }
 
@@ -942,6 +296,7 @@ int wc_ed25519_import_private_key(const byte* priv, word32 privSz,
     word32 keySz;
     int    ret;
 
+    /* sanity check on arguments */
     if (priv == NULL || pub == NULL || key == NULL)
         return BAD_FUNC_ARG;
 
diff --git a/wolfcrypt/src/fe_low_mem.c b/wolfcrypt/src/fe_low_mem.c
new file mode 100644
index 00000000000..80834f54ce8
--- /dev/null
+++ b/wolfcrypt/src/fe_low_mem.c
@@ -0,0 +1,596 @@
+/* fe_low_mem.c
+ *
+ * Copyright (C) 2006-2015 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL. (formerly known as CyaSSL)
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ */
+
+/* Based from Daniel Beer's public domain word. */
+
+#ifdef HAVE_CONFIG_H
+    #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#if defined(HAVE_ED25519) || defined(HAVE_CURVE25519)
+
+#include <wolfssl/wolfcrypt/fe_operations.h>
+
+#ifdef NO_INLINE
+    #include <wolfssl/wolfcrypt/misc.h>
+#else
+    #include <wolfcrypt/src/misc.c>
+#endif
+
+
+void fprime_copy(byte *x, const byte *a)
+{
+    int i;
+    for (i = 0; i < F25519_SIZE; i++)
+        x[i] = a[i];
+}
+
+
+void fe_copy(fe x, const fe a)
+{
+    int i;
+    for (i = 0; i < F25519_SIZE; i++)
+        x[i] = a[i];
+}
+
+
+/* Double an X-coordinate */
+static void xc_double(byte *x3, byte *z3,
+		      const byte *x1, const byte *z1)
+{
+	/* Explicit formulas database: dbl-1987-m
+	 *
+	 * source 1987 Montgomery "Speeding the Pollard and elliptic
+	 *   curve methods of factorization", page 261, fourth display
+	 * compute X3 = (X1^2-Z1^2)^2
+	 * compute Z3 = 4 X1 Z1 (X1^2 + a X1 Z1 + Z1^2)
+	 */
+	byte x1sq[F25519_SIZE];
+	byte z1sq[F25519_SIZE];
+	byte x1z1[F25519_SIZE];
+	byte a[F25519_SIZE];
+
+	fe_mul__distinct(x1sq, x1, x1);
+	fe_mul__distinct(z1sq, z1, z1);
+	fe_mul__distinct(x1z1, x1, z1);
+
+	fe_sub(a, x1sq, z1sq);
+	fe_mul__distinct(x3, a, a);
+
+	fe_mul_c(a, x1z1, 486662);
+	fe_add(a, x1sq, a);
+	fe_add(a, z1sq, a);
+	fe_mul__distinct(x1sq, x1z1, a);
+	fe_mul_c(z3, x1sq, 4);
+}
+
+
+/* Differential addition */
+static void xc_diffadd(byte *x5, byte *z5,
+		       const byte *x1, const byte *z1,
+		       const byte *x2, const byte *z2,
+		       const byte *x3, const byte *z3)
+{
+	/* Explicit formulas database: dbl-1987-m3
+	 *
+	 * source 1987 Montgomery "Speeding the Pollard and elliptic curve
+	 *   methods of factorization", page 261, fifth display, plus
+	 *   common-subexpression elimination
+	 * compute A = X2+Z2
+	 * compute B = X2-Z2
+	 * compute C = X3+Z3
+	 * compute D = X3-Z3
+	 * compute DA = D A
+	 * compute CB = C B
+	 * compute X5 = Z1(DA+CB)^2
+	 * compute Z5 = X1(DA-CB)^2
+	 */
+	byte da[F25519_SIZE];
+	byte cb[F25519_SIZE];
+	byte a[F25519_SIZE];
+	byte b[F25519_SIZE];
+
+	fe_add(a, x2, z2);
+	fe_sub(b, x3, z3); /* D */
+	fe_mul__distinct(da, a, b);
+
+	fe_sub(b, x2, z2);
+	fe_add(a, x3, z3); /* C */
+	fe_mul__distinct(cb, a, b);
+
+	fe_add(a, da, cb);
+	fe_mul__distinct(b, a, a);
+	fe_mul__distinct(x5, z1, b);
+
+	fe_sub(a, da, cb);
+	fe_mul__distinct(b, a, a);
+	fe_mul__distinct(z5, x1, b);
+}
+
+
+int curve25519(byte *result, byte *e, byte *q)
+{
+	/* Current point: P_m */
+	byte xm[F25519_SIZE];
+	byte zm[F25519_SIZE] = {1};
+
+	/* Predecessor: P_(m-1) */
+	byte xm1[F25519_SIZE] = {1};
+	byte zm1[F25519_SIZE] = {0};
+
+	int i;
+
+	/* Note: bit 254 is assumed to be 1 */
+	fe_copy(xm, q);
+
+	for (i = 253; i >= 0; i--) {
+		const int bit = (e[i >> 3] >> (i & 7)) & 1;
+		byte xms[F25519_SIZE];
+		byte zms[F25519_SIZE];
+
+		/* From P_m and P_(m-1), compute P_(2m) and P_(2m-1) */
+		xc_diffadd(xm1, zm1, q, f25519_one, xm, zm, xm1, zm1);
+		xc_double(xm, zm, xm, zm);
+
+		/* Compute P_(2m+1) */
+		xc_diffadd(xms, zms, xm1, zm1, xm, zm, q, f25519_one);
+
+		/* Select:
+		 *   bit = 1 --> (P_(2m+1), P_(2m))
+		 *   bit = 0 --> (P_(2m), P_(2m-1))
+		 */
+		fe_select(xm1, xm1, xm, bit);
+		fe_select(zm1, zm1, zm, bit);
+		fe_select(xm, xm, xms, bit);
+		fe_select(zm, zm, zms, bit);
+	}
+
+	/* Freeze out of projective coordinates */
+	fe_inv__distinct(zm1, zm);
+	fe_mul__distinct(result, zm1, xm);
+	fe_normalize(result);
+    return 0;
+}
+
+
+static void raw_add(byte *x, const byte *p)
+{
+	word16 c = 0;
+	int i;
+
+	for (i = 0; i < F25519_SIZE; i++) {
+		c += ((word16)x[i]) + ((word16)p[i]);
+		x[i] = c;
+		c >>= 8;
+	}
+}
+
+
+static void raw_try_sub(byte *x, const byte *p)
+{
+	byte minusp[F25519_SIZE];
+	word16 c = 0;
+	int i;
+
+	for (i = 0; i < F25519_SIZE; i++) {
+		c = ((word16)x[i]) - ((word16)p[i]) - c;
+		minusp[i] = c;
+		c = (c >> 8) & 1;
+	}
+
+	fprime_select(x, minusp, x, c);
+}
+
+
+static int prime_msb(const byte *p)
+{
+    int i;
+    byte x;
+    int shift = 1;
+    int z     = F25519_SIZE - 1;
+
+   /*
+       Test for any hot bits.
+       As soon as one instance is incountered set shift to 0.
+    */
+	for (i = F25519_SIZE - 1; i >= 0; i--) {
+        shift &= ((shift ^ ((-p[i] | p[i]) >> 7)) & 1);
+        z -= shift;
+    }
+	x = p[z];
+	z <<= 3;
+    shift = 1;
+    for (i = 0; i < 8; i++) {
+        shift &= ((-(x >> i) | (x >> i)) >> (7 - i) & 1);
+        z += shift;
+    }
+
+	return z - 1;
+}
+
+
+void fprime_select(byte *dst, const byte *zero, const byte *one, byte condition)
+{
+	const byte mask = -condition;
+	int i;
+
+	for (i = 0; i < F25519_SIZE; i++)
+		dst[i] = zero[i] ^ (mask & (one[i] ^ zero[i]));
+}
+
+
+void fprime_add(byte *r, const byte *a, const byte *modulus)
+{
+	raw_add(r, a);
+	raw_try_sub(r, modulus);
+}
+
+
+void fprime_sub(byte *r, const byte *a, const byte *modulus)
+{
+	raw_add(r, modulus);
+	raw_try_sub(r, a);
+	raw_try_sub(r, modulus);
+}
+
+
+void fprime_mul(byte *r, const byte *a, const byte *b,
+		const byte *modulus)
+{
+	word16 c = 0;
+	int i,j;
+
+	XMEMSET(r, 0, F25519_SIZE);
+
+	for (i = prime_msb(modulus); i >= 0; i--) {
+		const byte bit = (b[i >> 3] >> (i & 7)) & 1;
+		byte plusa[F25519_SIZE];
+
+	    for (j = 0; j < F25519_SIZE; j++) {
+		    c |= ((word16)r[j]) << 1;
+		    r[j] = c;
+		    c >>= 8;
+	    }
+		raw_try_sub(r, modulus);
+
+		fprime_copy(plusa, r);
+		fprime_add(plusa, a, modulus);
+
+		fprime_select(r, r, plusa, bit);
+	}
+}
+
+
+void fe_load(byte *x, word32 c)
+{
+	word32 i;
+
+	for (i = 0; i < sizeof(c); i++) {
+		x[i] = c;
+		c >>= 8;
+	}
+
+	for (; i < F25519_SIZE; i++)
+		x[i] = 0;
+}
+
+
+void fe_normalize(byte *x)
+{
+	byte minusp[F25519_SIZE];
+	word16 c;
+	int i;
+
+	/* Reduce using 2^255 = 19 mod p */
+	c = (x[31] >> 7) * 19;
+	x[31] &= 127;
+
+	for (i = 0; i < F25519_SIZE; i++) {
+		c += x[i];
+		x[i] = c;
+		c >>= 8;
+	}
+
+	/* The number is now less than 2^255 + 18, and therefore less than
+	 * 2p. Try subtracting p, and conditionally load the subtracted
+	 * value if underflow did not occur.
+	 */
+	c = 19;
+
+	for (i = 0; i + 1 < F25519_SIZE; i++) {
+		c += x[i];
+		minusp[i] = c;
+		c >>= 8;
+	}
+
+	c += ((word16)x[i]) - 128;
+	minusp[31] = c;
+
+	/* Load x-p if no underflow */
+	fe_select(x, minusp, x, (c >> 15) & 1);
+}
+
+
+void fe_select(byte *dst,
+		   const byte *zero, const byte *one,
+		   byte condition)
+{
+	const byte mask = -condition;
+	int i;
+
+	for (i = 0; i < F25519_SIZE; i++)
+		dst[i] = zero[i] ^ (mask & (one[i] ^ zero[i]));
+}
+
+
+void fe_add(fe r, const fe a, const fe b)
+{
+	word16 c = 0;
+	int i;
+
+	/* Add */
+	for (i = 0; i < F25519_SIZE; i++) {
+		c >>= 8;
+		c += ((word16)a[i]) + ((word16)b[i]);
+		r[i] = c;
+	}
+
+	/* Reduce with 2^255 = 19 mod p */
+	r[31] &= 127;
+	c = (c >> 7) * 19;
+
+	for (i = 0; i < F25519_SIZE; i++) {
+		c += r[i];
+		r[i] = c;
+		c >>= 8;
+	}
+}
+
+
+void fe_sub(fe r, const fe a, const fe b)
+{
+	word32 c = 0;
+	int i;
+
+	/* Calculate a + 2p - b, to avoid underflow */
+	c = 218;
+	for (i = 0; i + 1 < F25519_SIZE; i++) {
+		c += 65280 + ((word32)a[i]) - ((word32)b[i]);
+		r[i] = c;
+		c >>= 8;
+	}
+
+	c += ((word32)a[31]) - ((word32)b[31]);
+	r[31] = c & 127;
+	c = (c >> 7) * 19;
+
+	for (i = 0; i < F25519_SIZE; i++) {
+		c += r[i];
+		r[i] = c;
+		c >>= 8;
+	}
+}
+
+
+void fe_neg(fe r, const fe a)
+{
+	word32 c = 0;
+	int i;
+
+	/* Calculate 2p - a, to avoid underflow */
+	c = 218;
+	for (i = 0; i + 1 < F25519_SIZE; i++) {
+		c += 65280 - ((word32)a[i]);
+		r[i] = c;
+		c >>= 8;
+	}
+
+	c -= ((word32)a[31]);
+	r[31] = c & 127;
+	c = (c >> 7) * 19;
+
+	for (i = 0; i < F25519_SIZE; i++) {
+		c += r[i];
+		r[i] = c;
+		c >>= 8;
+	}
+}
+
+
+void fe_mul__distinct(byte *r, const byte *a, const byte *b)
+{
+	word32 c = 0;
+	int i;
+
+	for (i = 0; i < F25519_SIZE; i++) {
+		int j;
+
+		c >>= 8;
+		for (j = 0; j <= i; j++)
+			c += ((word32)a[j]) * ((word32)b[i - j]);
+
+		for (; j < F25519_SIZE; j++)
+			c += ((word32)a[j]) *
+			     ((word32)b[i + F25519_SIZE - j]) * 38;
+
+		r[i] = c;
+	}
+
+	r[31] &= 127;
+	c = (c >> 7) * 19;
+
+	for (i = 0; i < F25519_SIZE; i++) {
+		c += r[i];
+		r[i] = c;
+		c >>= 8;
+	}
+}
+
+
+void fe_mul(fe r, const fe a, const fe b)
+{
+	byte tmp[F25519_SIZE];
+
+	fe_mul__distinct(tmp, a, b);
+	fe_copy(r, tmp);
+}
+
+
+void fe_mul_c(byte *r, const byte *a, word32 b)
+{
+	word32 c = 0;
+	int i;
+
+	for (i = 0; i < F25519_SIZE; i++) {
+		c >>= 8;
+		c += b * ((word32)a[i]);
+		r[i] = c;
+	}
+
+	r[31] &= 127;
+	c >>= 7;
+	c *= 19;
+
+	for (i = 0; i < F25519_SIZE; i++) {
+		c += r[i];
+		r[i] = c;
+		c >>= 8;
+	}
+}
+
+
+void fe_inv__distinct(byte *r, const byte *x)
+{
+	byte s[F25519_SIZE];
+	int i;
+
+	/* This is a prime field, so by Fermat's little theorem:
+	 *
+	 *     x^(p-1) = 1 mod p
+	 *
+	 * Therefore, raise to (p-2) = 2^255-21 to get a multiplicative
+	 * inverse.
+	 *
+	 * This is a 255-bit binary number with the digits:
+	 *
+	 *     11111111... 01011
+	 *
+	 * We compute the result by the usual binary chain, but
+	 * alternate between keeping the accumulator in r and s, so as
+	 * to avoid copying temporaries.
+	 */
+
+	/* 1 1 */
+	fe_mul__distinct(s, x, x);
+	fe_mul__distinct(r, s, x);
+
+	/* 1 x 248 */
+	for (i = 0; i < 248; i++) {
+		fe_mul__distinct(s, r, r);
+		fe_mul__distinct(r, s, x);
+	}
+
+	/* 0 */
+	fe_mul__distinct(s, r, r);
+
+	/* 1 */
+	fe_mul__distinct(r, s, s);
+	fe_mul__distinct(s, r, x);
+
+	/* 0 */
+	fe_mul__distinct(r, s, s);
+
+	/* 1 */
+	fe_mul__distinct(s, r, r);
+	fe_mul__distinct(r, s, x);
+
+	/* 1 */
+	fe_mul__distinct(s, r, r);
+	fe_mul__distinct(r, s, x);
+}
+
+
+void fe_invert(fe r, const fe x)
+{
+	byte tmp[F25519_SIZE];
+
+	fe_inv__distinct(tmp, x);
+	fe_copy(r, tmp);
+}
+
+
+/* Raise x to the power of (p-5)/8 = 2^252-3, using s for temporary
+ * storage.
+ */
+static void exp2523(byte *r, const byte *x, byte *s)
+{
+	int i;
+
+	/* This number is a 252-bit number with the binary expansion:
+	 *
+	 *     111111... 01
+	 */
+
+	/* 1 1 */
+	fe_mul__distinct(r, x, x);
+	fe_mul__distinct(s, r, x);
+
+	/* 1 x 248 */
+	for (i = 0; i < 248; i++) {
+		fe_mul__distinct(r, s, s);
+		fe_mul__distinct(s, r, x);
+	}
+
+	/* 0 */
+	fe_mul__distinct(r, s, s);
+
+	/* 1 */
+	fe_mul__distinct(s, r, r);
+	fe_mul__distinct(r, s, x);
+}
+
+
+void fe_sqrt(byte *r, const byte *a)
+{
+	byte v[F25519_SIZE];
+	byte i[F25519_SIZE];
+	byte x[F25519_SIZE];
+	byte y[F25519_SIZE];
+
+	/* v = (2a)^((p-5)/8) [x = 2a] */
+	fe_mul_c(x, a, 2);
+	exp2523(v, x, y);
+
+	/* i = 2av^2 - 1 */
+	fe_mul__distinct(y, v, v);
+	fe_mul__distinct(i, x, y);
+	fe_load(y, 1);
+	fe_sub(i, i, y);
+
+	/* r = avi */
+	fe_mul__distinct(x, v, a);
+	fe_mul__distinct(r, x, i);
+}
+
+#endif /* HAVE_CURVE25519 or HAVE_ED25519 */
+
diff --git a/wolfcrypt/src/fe_operations.c b/wolfcrypt/src/fe_operations.c
index 5d50517cc68..da07c951c11 100644
--- a/wolfcrypt/src/fe_operations.c
+++ b/wolfcrypt/src/fe_operations.c
@@ -105,6 +105,72 @@ void fe_0(fe h)
 }
 
 
+int curve25519(byte* q, byte* n, byte* p)
+{
+  unsigned char e[32];
+  unsigned int i;
+  fe x1;
+  fe x2;
+  fe z2;
+  fe x3;
+  fe z3;
+  fe tmp0;
+  fe tmp1;
+  int pos;
+  unsigned int swap;
+  unsigned int b;
+
+  for (i = 0;i < 32;++i) e[i] = n[i];
+  e[0] &= 248;
+  e[31] &= 127;
+  e[31] |= 64;
+
+  fe_frombytes(x1,p);
+  fe_1(x2);
+  fe_0(z2);
+  fe_copy(x3,x1);
+  fe_1(z3);
+
+  swap = 0;
+  for (pos = 254;pos >= 0;--pos) {
+    b = e[pos / 8] >> (pos & 7);
+    b &= 1;
+    swap ^= b;
+    fe_cswap(x2,x3,swap);
+    fe_cswap(z2,z3,swap);
+    swap = b;
+
+    /* montgomery */
+	fe_sub(tmp0,x3,z3);
+	fe_sub(tmp1,x2,z2);
+	fe_add(x2,x2,z2);
+	fe_add(z2,x3,z3);
+	fe_mul(z3,tmp0,x2);
+	fe_mul(z2,z2,tmp1);
+	fe_sq(tmp0,tmp1);
+	fe_sq(tmp1,x2);
+	fe_add(x3,z3,z2);
+	fe_sub(z2,z3,z2);
+	fe_mul(x2,tmp1,tmp0);
+	fe_sub(tmp1,tmp1,tmp0);
+	fe_sq(z2,z2);
+	fe_mul121666(z3,tmp1);
+	fe_sq(x3,x3);
+	fe_add(tmp0,tmp0,z3);
+	fe_mul(z3,x1,z2);
+	fe_mul(z2,tmp1,tmp0);
+  }
+  fe_cswap(x2,x3,swap);
+  fe_cswap(z2,z3,swap);
+
+  fe_invert(z2,z2);
+  fe_mul(x2,x2,z2);
+  fe_tobytes(q,x2);
+
+  return 0;
+}
+
+
 /*
 h = f * f
 Can overlap h with f.
@@ -1236,9 +1302,6 @@ void fe_neg(fe h,const fe f)
 
 
 /*
-return 1 if f == 0
-return 0 if f != 0
-
 Preconditions:
    |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
 */
diff --git a/wolfcrypt/src/ge_low_mem.c b/wolfcrypt/src/ge_low_mem.c
new file mode 100644
index 00000000000..43c533c695b
--- /dev/null
+++ b/wolfcrypt/src/ge_low_mem.c
@@ -0,0 +1,558 @@
+/* ge_low_mem.c
+ *
+ * Copyright (C) 2006-2015 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL. (formerly known as CyaSSL)
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ */
+
+ /* Based from Daniel Beer's public domain work. */
+
+#ifdef HAVE_CONFIG_H
+    #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef HAVE_ED25519
+
+#include <wolfssl/wolfcrypt/ge_operations.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#ifdef NO_INLINE
+    #include <wolfssl/wolfcrypt/misc.h>
+#else
+    #include <wolfcrypt/src/misc.c>
+#endif
+
+void ed25519_smult(ge_p3 *r, const ge_p3 *a, const byte *e);
+void ed25519_add(ge_p3 *r, const ge_p3 *a, const ge_p3 *b);
+void ed25519_double(ge_p3 *r, const ge_p3 *a);
+
+
+static const byte ed25519_order[F25519_SIZE] = {
+	0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58,
+	0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10
+};
+
+/*Arithmetic modulo the group order m = 2^252 +
+ 27742317777372353535851937790883648493 =
+ 7237005577332262213973186563042994240857116359379907606001950938285454250989 */
+
+static const word32 m[32] = {
+    0xED,0xD3,0xF5,0x5C,0x1A,0x63,0x12,0x58,0xD6,0x9C,0xF7,0xA2,0xDE,0xF9,
+    0xDE,0x14,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x00,0x00,0x10
+};
+
+static const word32 mu[33] = {
+    0x1B,0x13,0x2C,0x0A,0xA3,0xE5,0x9C,0xED,0xA7,0x29,0x63,0x08,0x5D,0x21,
+    0x06,0x21,0xEB,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+    0xFF,0xFF,0xFF,0xFF,0x0F
+};
+
+
+int ge_compress_key(byte* out, const byte* xIn, const byte* yIn,
+                        word32 keySz)
+{
+	byte tmp[F25519_SIZE];
+	byte parity;
+    int     i;
+
+	fe_copy(tmp, xIn);
+	parity = (tmp[0] & 1) << 7;
+
+    byte pt[32];
+	fe_copy(pt, yIn);
+	pt[31] |= parity;
+
+    for(i = 0; i < 32; i++) {
+        out[32-i-1] = pt[i];
+    }
+    (void)keySz;
+    return 0;
+}
+
+
+static word32 lt(word32 a,word32 b) /* 16-bit inputs */
+{
+  unsigned int x = a;
+  x -= (unsigned int) b; /* 0..65535: no; 4294901761..4294967295: yes */
+  x >>= 31; /* 0: no; 1: yes */
+  return x;
+}
+
+
+/* Reduce coefficients of r before calling reduce_add_sub */
+static void reduce_add_sub(word32 *r)
+{
+  word32 pb = 0;
+  word32 b;
+  word32 mask;
+  int i;
+  unsigned char t[32];
+
+  for(i=0;i<32;i++)
+  {
+    pb += m[i];
+    b = lt(r[i],pb);
+    t[i] = r[i]-pb+(b<<8);
+    pb = b;
+  }
+  mask = b - 1;
+  for(i=0;i<32;i++)
+    r[i] ^= mask & (r[i] ^ t[i]);
+}
+
+
+/* Reduce coefficients of x before calling barrett_reduce */
+static void barrett_reduce(word32* r, word32 x[64])
+{
+  /* See HAC, Alg. 14.42 */
+  int i,j;
+  word32 q2[66];
+  word32 *q3 = q2 + 33;
+  word32 r1[33];
+  word32 r2[33];
+  word32 carry;
+  word32 pb = 0;
+  word32 b;
+
+  for (i = 0;i < 66;++i) q2[i] = 0;
+  for (i = 0;i < 33;++i) r2[i] = 0;
+
+  for(i=0;i<33;i++)
+    for(j=0;j<33;j++)
+      if(i+j >= 31) q2[i+j] += mu[i]*x[j+31];
+  carry = q2[31] >> 8;
+  q2[32] += carry;
+  carry = q2[32] >> 8;
+  q2[33] += carry;
+
+  for(i=0;i<33;i++)r1[i] = x[i];
+  for(i=0;i<32;i++)
+    for(j=0;j<33;j++)
+      if(i+j < 33) r2[i+j] += m[i]*q3[j];
+
+  for(i=0;i<32;i++)
+  {
+    carry = r2[i] >> 8;
+    r2[i+1] += carry;
+    r2[i] &= 0xff;
+  }
+
+  for(i=0;i<32;i++)
+  {
+    pb += r2[i];
+    b = lt(r1[i],pb);
+    r[i] = r1[i]-pb+(b<<8);
+    pb = b;
+  }
+
+  /* XXX: Can it really happen that r<0?, See HAC, Alg 14.42, Step 3
+   * r is an unsigned type.
+   * If so: Handle  it here!
+   */
+
+  reduce_add_sub(r);
+  reduce_add_sub(r);
+}
+
+
+void sc_reduce(unsigned char x[64])
+{
+  int i;
+  word32 t[64];
+  word32 r[32];
+  for(i=0;i<64;i++) t[i] = x[i];
+  barrett_reduce(r, t);
+  for(i=0;i<32;i++) x[i] = (r[i] & 0xFF);
+}
+
+
+void sc_muladd(byte* out, const byte* a, const byte* b, const byte* c)
+{
+
+	byte s[32];
+    byte e[64];
+
+    XMEMSET(e, 0, sizeof(e));
+    XMEMCPY(e, b, 32);
+
+	/* Obtain e */
+	sc_reduce(e);
+
+	/* Compute s = ze + k */
+	fprime_mul(s, a, e, ed25519_order);
+	fprime_add(s, c, ed25519_order);
+
+	XMEMCPY(out, s, 32);
+}
+
+
+/* Base point is (numbers wrapped):
+ *
+ *     x = 151122213495354007725011514095885315114
+ *         54012693041857206046113283949847762202
+ *     y = 463168356949264781694283940034751631413
+ *         07993866256225615783033603165251855960
+ *
+ * y is derived by transforming the original Montgomery base (u=9). x
+ * is the corresponding positive coordinate for the new curve equation.
+ * t is x*y.
+ */
+const ge_p3 ed25519_base = {
+	.X = {
+		0x1a, 0xd5, 0x25, 0x8f, 0x60, 0x2d, 0x56, 0xc9,
+		0xb2, 0xa7, 0x25, 0x95, 0x60, 0xc7, 0x2c, 0x69,
+		0x5c, 0xdc, 0xd6, 0xfd, 0x31, 0xe2, 0xa4, 0xc0,
+		0xfe, 0x53, 0x6e, 0xcd, 0xd3, 0x36, 0x69, 0x21
+	},
+	.Y = {
+		0x58, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66,
+		0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66,
+		0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66,
+		0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66
+	},
+	.T = {
+		0xa3, 0xdd, 0xb7, 0xa5, 0xb3, 0x8a, 0xde, 0x6d,
+		0xf5, 0x52, 0x51, 0x77, 0x80, 0x9f, 0xf0, 0x20,
+		0x7d, 0xe3, 0xab, 0x64, 0x8e, 0x4e, 0xea, 0x66,
+		0x65, 0x76, 0x8b, 0xd7, 0x0f, 0x5f, 0x87, 0x67
+	},
+	.Z = {1, 0}
+};
+
+
+const ge_p3 ed25519_neutral = {
+	.X = {0},
+	.Y = {1, 0},
+	.T = {0},
+	.Z = {1, 0}
+};
+
+
+static const byte ed25519_d[F25519_SIZE] = {
+	0xa3, 0x78, 0x59, 0x13, 0xca, 0x4d, 0xeb, 0x75,
+	0xab, 0xd8, 0x41, 0x41, 0x4d, 0x0a, 0x70, 0x00,
+	0x98, 0xe8, 0x79, 0x77, 0x79, 0x40, 0xc7, 0x8c,
+	0x73, 0xfe, 0x6f, 0x2b, 0xee, 0x6c, 0x03, 0x52
+};
+
+
+/* k = 2d */
+static const byte ed25519_k[F25519_SIZE] = {
+	0x59, 0xf1, 0xb2, 0x26, 0x94, 0x9b, 0xd6, 0xeb,
+	0x56, 0xb1, 0x83, 0x82, 0x9a, 0x14, 0xe0, 0x00,
+	0x30, 0xd1, 0xf3, 0xee, 0xf2, 0x80, 0x8e, 0x19,
+	0xe7, 0xfc, 0xdf, 0x56, 0xdc, 0xd9, 0x06, 0x24
+};
+
+
+void ed25519_add(ge_p3 *r,
+		 const ge_p3 *p1, const ge_p3 *p2)
+{
+	/* Explicit formulas database: add-2008-hwcd-3
+	 *
+	 * source 2008 Hisil--Wong--Carter--Dawson,
+	 *     http://eprint.iacr.org/2008/522, Section 3.1
+	 * appliesto extended-1
+	 * parameter k
+	 * assume k = 2 d
+	 * compute A = (Y1-X1)(Y2-X2)
+	 * compute B = (Y1+X1)(Y2+X2)
+	 * compute C = T1 k T2
+	 * compute D = Z1 2 Z2
+	 * compute E = B - A
+	 * compute F = D - C
+	 * compute G = D + C
+	 * compute H = B + A
+	 * compute X3 = E F
+	 * compute Y3 = G H
+	 * compute T3 = E H
+	 * compute Z3 = F G
+	 */
+	byte a[F25519_SIZE];
+	byte b[F25519_SIZE];
+	byte c[F25519_SIZE];
+	byte d[F25519_SIZE];
+	byte e[F25519_SIZE];
+	byte f[F25519_SIZE];
+	byte g[F25519_SIZE];
+	byte h[F25519_SIZE];
+
+	/* A = (Y1-X1)(Y2-X2) */
+	fe_sub(c, p1->Y, p1->X);
+	fe_sub(d, p2->Y, p2->X);
+	fe_mul__distinct(a, c, d);
+
+	/* B = (Y1+X1)(Y2+X2) */
+	fe_add(c, p1->Y, p1->X);
+	fe_add(d, p2->Y, p2->X);
+	fe_mul__distinct(b, c, d);
+
+	/* C = T1 k T2 */
+	fe_mul__distinct(d, p1->T, p2->T);
+	fe_mul__distinct(c, d, ed25519_k);
+
+	/* D = Z1 2 Z2 */
+	fe_mul__distinct(d, p1->Z, p2->Z);
+	fe_add(d, d, d);
+
+	/* E = B - A */
+	fe_sub(e, b, a);
+
+	/* F = D - C */
+	fe_sub(f, d, c);
+
+	/* G = D + C */
+	fe_add(g, d, c);
+
+	/* H = B + A */
+	fe_add(h, b, a);
+
+	/* X3 = E F */
+	fe_mul__distinct(r->X, e, f);
+
+	/* Y3 = G H */
+	fe_mul__distinct(r->Y, g, h);
+
+	/* T3 = E H */
+	fe_mul__distinct(r->T, e, h);
+
+	/* Z3 = F G */
+	fe_mul__distinct(r->Z, f, g);
+}
+
+
+void ed25519_double(ge_p3 *r, const ge_p3 *p)
+{
+	/* Explicit formulas database: dbl-2008-hwcd
+	 *
+	 * source 2008 Hisil--Wong--Carter--Dawson,
+	 *     http://eprint.iacr.org/2008/522, Section 3.3
+	 * compute A = X1^2
+	 * compute B = Y1^2
+	 * compute C = 2 Z1^2
+	 * compute D = a A
+	 * compute E = (X1+Y1)^2-A-B
+	 * compute G = D + B
+	 * compute F = G - C
+	 * compute H = D - B
+	 * compute X3 = E F
+	 * compute Y3 = G H
+	 * compute T3 = E H
+	 * compute Z3 = F G
+	 */
+	byte a[F25519_SIZE];
+	byte b[F25519_SIZE];
+	byte c[F25519_SIZE];
+	byte e[F25519_SIZE];
+	byte f[F25519_SIZE];
+	byte g[F25519_SIZE];
+	byte h[F25519_SIZE];
+
+	/* A = X1^2 */
+	fe_mul__distinct(a, p->X, p->X);
+
+	/* B = Y1^2 */
+	fe_mul__distinct(b, p->Y, p->Y);
+
+	/* C = 2 Z1^2 */
+	fe_mul__distinct(c, p->Z, p->Z);
+	fe_add(c, c, c);
+
+	/* D = a A (alter sign) */
+	/* E = (X1+Y1)^2-A-B */
+	fe_add(f, p->X, p->Y);
+	fe_mul__distinct(e, f, f);
+	fe_sub(e, e, a);
+	fe_sub(e, e, b);
+
+	/* G = D + B */
+	fe_sub(g, b, a);
+
+	/* F = G - C */
+	fe_sub(f, g, c);
+
+	/* H = D - B */
+	fe_neg(h, b);
+	fe_sub(h, h, a);
+
+	/* X3 = E F */
+	fe_mul__distinct(r->X, e, f);
+
+	/* Y3 = G H */
+	fe_mul__distinct(r->Y, g, h);
+
+	/* T3 = E H */
+	fe_mul__distinct(r->T, e, h);
+
+	/* Z3 = F G */
+	fe_mul__distinct(r->Z, f, g);
+}
+
+
+void ed25519_smult(ge_p3 *r_out, const ge_p3 *p, const byte *e)
+{
+	ge_p3 r;
+	int   i;
+
+    XMEMCPY(&r, &ed25519_neutral, sizeof(r));
+
+	for (i = 255; i >= 0; i--) {
+		const byte bit = (e[i >> 3] >> (i & 7)) & 1;
+		ge_p3 s;
+
+		ed25519_double(&r, &r);
+		ed25519_add(&s, &r, p);
+
+		fe_select(r.X, r.X, s.X, bit);
+		fe_select(r.Y, r.Y, s.Y, bit);
+		fe_select(r.Z, r.Z, s.Z, bit);
+		fe_select(r.T, r.T, s.T, bit);
+	}
+    XMEMCPY(r_out, &r, sizeof(r));
+}
+
+
+void ge_scalarmult_base(ge_p3 *R,const unsigned char *nonce)
+{
+	ed25519_smult(R, &ed25519_base, nonce);
+}
+
+
+/* pack the point h into array s */
+void ge_p3_tobytes(unsigned char *s,const ge_p3 *h)
+{
+	byte x[F25519_SIZE];
+	byte y[F25519_SIZE];
+	byte z1[F25519_SIZE];
+	byte parity;
+
+	fe_inv__distinct(z1, h->Z);
+	fe_mul__distinct(x, h->X, z1);
+	fe_mul__distinct(y, h->Y, z1);
+
+	fe_normalize(x);
+	fe_normalize(y);
+
+	parity = (x[0] & 1) << 7;
+	fe_copy(s, y);
+	fe_normalize(s);
+	s[31] |= parity;
+}
+
+
+/* pack the point h into array s */
+void ge_tobytes(unsigned char *s,const ge_p2 *h)
+{
+	byte x[F25519_SIZE];
+	byte y[F25519_SIZE];
+	byte z1[F25519_SIZE];
+	byte parity;
+
+	fe_inv__distinct(z1, h->Z);
+	fe_mul__distinct(x, h->X, z1);
+	fe_mul__distinct(y, h->Y, z1);
+
+	fe_normalize(x);
+	fe_normalize(y);
+
+	parity = (x[0] & 1) << 7;
+	fe_copy(s, y);
+	fe_normalize(s);
+	s[31] |= parity;
+}
+
+
+/*
+   Test if the public key can be uncommpressed and negate it (-X,Y,Z,-T)
+   return 0 on success
+ */
+int ge_frombytes_negate_vartime(ge_p3 *p,const unsigned char *s)
+{
+
+	byte parity;
+    byte x[F25519_SIZE];
+	byte y[F25519_SIZE];
+	byte a[F25519_SIZE];
+	byte b[F25519_SIZE];
+	byte c[F25519_SIZE];
+    int ret = 0;
+
+    /* unpack the key s */
+    parity = s[31] >> 7;
+    fe_copy(y, s);
+	y[31] &= 127;
+
+	fe_mul__distinct(c, y, y);
+    fe_mul__distinct(b, c, ed25519_d);
+	fe_add(a, b, f25519_one);
+	fe_inv__distinct(b, a);
+	fe_sub(a, c, f25519_one);
+	fe_mul__distinct(c, a, b);
+	fe_sqrt(a, c);
+	fe_neg(b, a);
+	fe_select(x, a, b, (a[0] ^ parity) & 1);
+
+    /* test that x^2 is equal to c */
+    fe_mul__distinct(a, x, x);
+	fe_normalize(a);
+	fe_normalize(c);
+	ret |= ConstantCompare(a, c, F25519_SIZE);
+
+    /* project the key s onto p */
+	fe_copy(p->X, x);
+	fe_copy(p->Y, y);
+	fe_load(p->Z, 1);
+	fe_mul__distinct(p->T, x, y);
+
+    /* negate, the point becomes (-X,Y,Z,-T) */
+    fe_neg(p->X,p->X);
+    fe_neg(p->T,p->T);
+
+    return ret;
+}
+
+
+int ge_double_scalarmult_vartime(ge_p2* R, const unsigned char *h,
+                                 const ge_p3 *inA,const unsigned char *sig)
+{
+    ge_p3 p, A;
+    int ret = 0;
+
+    XMEMCPY(&A, inA, sizeof(ge_p3));
+
+    /* find SB */
+    ed25519_smult(&p, &ed25519_base, sig);
+
+    /* find H(R,A,M) * -A */
+	ed25519_smult(&A, &A, h);
+
+    /* SB + -H(R,A,M)A */
+	ed25519_add(&A, &p, &A);
+
+    fe_copy(R->X, A.X);
+    fe_copy(R->Y, A.Y);
+    fe_copy(R->Z, A.Z);
+
+    return ret;
+}
+
+#endif /* HAVE_ED25519 */
+
diff --git a/wolfcrypt/src/ge_operations.c b/wolfcrypt/src/ge_operations.c
index 2a4885ae894..665cfe89b0a 100644
--- a/wolfcrypt/src/ge_operations.c
+++ b/wolfcrypt/src/ge_operations.c
@@ -51,12 +51,670 @@ where d = -121665/121666.
   ge_p1p1 (completed): ((X:Z),(Y:T)) satisfying x=X/Z, y=Y/T
   ge_precomp (Duif): (y+x,y-x,2dxy)
 */
+
+
 /*
+Input:
+  s[0]+256*s[1]+...+256^63*s[63] = s
+
+Output:
+  s[0]+256*s[1]+...+256^31*s[31] = s mod l
+  where l = 2^252 + 27742317777372353535851937790883648493.
+  Overwrites s in place.
+*/
+void sc_reduce(byte* s)
+{
+  int64_t s0 = 2097151 & load_3(s);
+  int64_t s1 = 2097151 & (load_4(s + 2) >> 5);
+  int64_t s2 = 2097151 & (load_3(s + 5) >> 2);
+  int64_t s3 = 2097151 & (load_4(s + 7) >> 7);
+  int64_t s4 = 2097151 & (load_4(s + 10) >> 4);
+  int64_t s5 = 2097151 & (load_3(s + 13) >> 1);
+  int64_t s6 = 2097151 & (load_4(s + 15) >> 6);
+  int64_t s7 = 2097151 & (load_3(s + 18) >> 3);
+  int64_t s8 = 2097151 & load_3(s + 21);
+  int64_t s9 = 2097151 & (load_4(s + 23) >> 5);
+  int64_t s10 = 2097151 & (load_3(s + 26) >> 2);
+  int64_t s11 = 2097151 & (load_4(s + 28) >> 7);
+  int64_t s12 = 2097151 & (load_4(s + 31) >> 4);
+  int64_t s13 = 2097151 & (load_3(s + 34) >> 1);
+  int64_t s14 = 2097151 & (load_4(s + 36) >> 6);
+  int64_t s15 = 2097151 & (load_3(s + 39) >> 3);
+  int64_t s16 = 2097151 & load_3(s + 42);
+  int64_t s17 = 2097151 & (load_4(s + 44) >> 5);
+  int64_t s18 = 2097151 & (load_3(s + 47) >> 2);
+  int64_t s19 = 2097151 & (load_4(s + 49) >> 7);
+  int64_t s20 = 2097151 & (load_4(s + 52) >> 4);
+  int64_t s21 = 2097151 & (load_3(s + 55) >> 1);
+  int64_t s22 = 2097151 & (load_4(s + 57) >> 6);
+  int64_t s23 = (load_4(s + 60) >> 3);
+  int64_t carry0;
+  int64_t carry1;
+  int64_t carry2;
+  int64_t carry3;
+  int64_t carry4;
+  int64_t carry5;
+  int64_t carry6;
+  int64_t carry7;
+  int64_t carry8;
+  int64_t carry9;
+  int64_t carry10;
+  int64_t carry11;
+  int64_t carry12;
+  int64_t carry13;
+  int64_t carry14;
+  int64_t carry15;
+  int64_t carry16;
+
+  s11 += s23 * 666643;
+  s12 += s23 * 470296;
+  s13 += s23 * 654183;
+  s14 -= s23 * 997805;
+  s15 += s23 * 136657;
+  s16 -= s23 * 683901;
+  s23 = 0;
+
+  s10 += s22 * 666643;
+  s11 += s22 * 470296;
+  s12 += s22 * 654183;
+  s13 -= s22 * 997805;
+  s14 += s22 * 136657;
+  s15 -= s22 * 683901;
+  s22 = 0;
+
+  s9 += s21 * 666643;
+  s10 += s21 * 470296;
+  s11 += s21 * 654183;
+  s12 -= s21 * 997805;
+  s13 += s21 * 136657;
+  s14 -= s21 * 683901;
+  s21 = 0;
+
+  s8 += s20 * 666643;
+  s9 += s20 * 470296;
+  s10 += s20 * 654183;
+  s11 -= s20 * 997805;
+  s12 += s20 * 136657;
+  s13 -= s20 * 683901;
+  s20 = 0;
+
+  s7 += s19 * 666643;
+  s8 += s19 * 470296;
+  s9 += s19 * 654183;
+  s10 -= s19 * 997805;
+  s11 += s19 * 136657;
+  s12 -= s19 * 683901;
+  s19 = 0;
+
+  s6 += s18 * 666643;
+  s7 += s18 * 470296;
+  s8 += s18 * 654183;
+  s9 -= s18 * 997805;
+  s10 += s18 * 136657;
+  s11 -= s18 * 683901;
+  s18 = 0;
+
+  carry6 = (s6 + (1<<20)) >> 21; s7 += carry6; s6 -= carry6 << 21;
+  carry8 = (s8 + (1<<20)) >> 21; s9 += carry8; s8 -= carry8 << 21;
+  carry10 = (s10 + (1<<20)) >> 21; s11 += carry10; s10 -= carry10 << 21;
+  carry12 = (s12 + (1<<20)) >> 21; s13 += carry12; s12 -= carry12 << 21;
+  carry14 = (s14 + (1<<20)) >> 21; s15 += carry14; s14 -= carry14 << 21;
+  carry16 = (s16 + (1<<20)) >> 21; s17 += carry16; s16 -= carry16 << 21;
+
+  carry7 = (s7 + (1<<20)) >> 21; s8 += carry7; s7 -= carry7 << 21;
+  carry9 = (s9 + (1<<20)) >> 21; s10 += carry9; s9 -= carry9 << 21;
+  carry11 = (s11 + (1<<20)) >> 21; s12 += carry11; s11 -= carry11 << 21;
+  carry13 = (s13 + (1<<20)) >> 21; s14 += carry13; s13 -= carry13 << 21;
+  carry15 = (s15 + (1<<20)) >> 21; s16 += carry15; s15 -= carry15 << 21;
+
+  s5 += s17 * 666643;
+  s6 += s17 * 470296;
+  s7 += s17 * 654183;
+  s8 -= s17 * 997805;
+  s9 += s17 * 136657;
+  s10 -= s17 * 683901;
+  s17 = 0;
+
+  s4 += s16 * 666643;
+  s5 += s16 * 470296;
+  s6 += s16 * 654183;
+  s7 -= s16 * 997805;
+  s8 += s16 * 136657;
+  s9 -= s16 * 683901;
+  s16 = 0;
+
+  s3 += s15 * 666643;
+  s4 += s15 * 470296;
+  s5 += s15 * 654183;
+  s6 -= s15 * 997805;
+  s7 += s15 * 136657;
+  s8 -= s15 * 683901;
+  s15 = 0;
+
+  s2 += s14 * 666643;
+  s3 += s14 * 470296;
+  s4 += s14 * 654183;
+  s5 -= s14 * 997805;
+  s6 += s14 * 136657;
+  s7 -= s14 * 683901;
+  s14 = 0;
+
+  s1 += s13 * 666643;
+  s2 += s13 * 470296;
+  s3 += s13 * 654183;
+  s4 -= s13 * 997805;
+  s5 += s13 * 136657;
+  s6 -= s13 * 683901;
+  s13 = 0;
+
+  s0 += s12 * 666643;
+  s1 += s12 * 470296;
+  s2 += s12 * 654183;
+  s3 -= s12 * 997805;
+  s4 += s12 * 136657;
+  s5 -= s12 * 683901;
+  s12 = 0;
+
+  carry0 = (s0 + (1<<20)) >> 21; s1 += carry0; s0 -= carry0 << 21;
+  carry2 = (s2 + (1<<20)) >> 21; s3 += carry2; s2 -= carry2 << 21;
+  carry4 = (s4 + (1<<20)) >> 21; s5 += carry4; s4 -= carry4 << 21;
+  carry6 = (s6 + (1<<20)) >> 21; s7 += carry6; s6 -= carry6 << 21;
+  carry8 = (s8 + (1<<20)) >> 21; s9 += carry8; s8 -= carry8 << 21;
+  carry10 = (s10 + (1<<20)) >> 21; s11 += carry10; s10 -= carry10 << 21;
+
+  carry1 = (s1 + (1<<20)) >> 21; s2 += carry1; s1 -= carry1 << 21;
+  carry3 = (s3 + (1<<20)) >> 21; s4 += carry3; s3 -= carry3 << 21;
+  carry5 = (s5 + (1<<20)) >> 21; s6 += carry5; s5 -= carry5 << 21;
+  carry7 = (s7 + (1<<20)) >> 21; s8 += carry7; s7 -= carry7 << 21;
+  carry9 = (s9 + (1<<20)) >> 21; s10 += carry9; s9 -= carry9 << 21;
+  carry11 = (s11 + (1<<20)) >> 21; s12 += carry11; s11 -= carry11 << 21;
+
+  s0 += s12 * 666643;
+  s1 += s12 * 470296;
+  s2 += s12 * 654183;
+  s3 -= s12 * 997805;
+  s4 += s12 * 136657;
+  s5 -= s12 * 683901;
+  s12 = 0;
+
+  carry0 = s0 >> 21; s1 += carry0; s0 -= carry0 << 21;
+  carry1 = s1 >> 21; s2 += carry1; s1 -= carry1 << 21;
+  carry2 = s2 >> 21; s3 += carry2; s2 -= carry2 << 21;
+  carry3 = s3 >> 21; s4 += carry3; s3 -= carry3 << 21;
+  carry4 = s4 >> 21; s5 += carry4; s4 -= carry4 << 21;
+  carry5 = s5 >> 21; s6 += carry5; s5 -= carry5 << 21;
+  carry6 = s6 >> 21; s7 += carry6; s6 -= carry6 << 21;
+  carry7 = s7 >> 21; s8 += carry7; s7 -= carry7 << 21;
+  carry8 = s8 >> 21; s9 += carry8; s8 -= carry8 << 21;
+  carry9 = s9 >> 21; s10 += carry9; s9 -= carry9 << 21;
+  carry10 = s10 >> 21; s11 += carry10; s10 -= carry10 << 21;
+  carry11 = s11 >> 21; s12 += carry11; s11 -= carry11 << 21;
+
+  s0 += s12 * 666643;
+  s1 += s12 * 470296;
+  s2 += s12 * 654183;
+  s3 -= s12 * 997805;
+  s4 += s12 * 136657;
+  s5 -= s12 * 683901;
+  s12 = 0;
+
+  carry0 = s0 >> 21; s1 += carry0; s0 -= carry0 << 21;
+  carry1 = s1 >> 21; s2 += carry1; s1 -= carry1 << 21;
+  carry2 = s2 >> 21; s3 += carry2; s2 -= carry2 << 21;
+  carry3 = s3 >> 21; s4 += carry3; s3 -= carry3 << 21;
+  carry4 = s4 >> 21; s5 += carry4; s4 -= carry4 << 21;
+  carry5 = s5 >> 21; s6 += carry5; s5 -= carry5 << 21;
+  carry6 = s6 >> 21; s7 += carry6; s6 -= carry6 << 21;
+  carry7 = s7 >> 21; s8 += carry7; s7 -= carry7 << 21;
+  carry8 = s8 >> 21; s9 += carry8; s8 -= carry8 << 21;
+  carry9 = s9 >> 21; s10 += carry9; s9 -= carry9 << 21;
+  carry10 = s10 >> 21; s11 += carry10; s10 -= carry10 << 21;
+
+  s[0] = s0 >> 0;
+  s[1] = s0 >> 8;
+  s[2] = (s0 >> 16) | (s1 << 5);
+  s[3] = s1 >> 3;
+  s[4] = s1 >> 11;
+  s[5] = (s1 >> 19) | (s2 << 2);
+  s[6] = s2 >> 6;
+  s[7] = (s2 >> 14) | (s3 << 7);
+  s[8] = s3 >> 1;
+  s[9] = s3 >> 9;
+  s[10] = (s3 >> 17) | (s4 << 4);
+  s[11] = s4 >> 4;
+  s[12] = s4 >> 12;
+  s[13] = (s4 >> 20) | (s5 << 1);
+  s[14] = s5 >> 7;
+  s[15] = (s5 >> 15) | (s6 << 6);
+  s[16] = s6 >> 2;
+  s[17] = s6 >> 10;
+  s[18] = (s6 >> 18) | (s7 << 3);
+  s[19] = s7 >> 5;
+  s[20] = s7 >> 13;
+  s[21] = s8 >> 0;
+  s[22] = s8 >> 8;
+  s[23] = (s8 >> 16) | (s9 << 5);
+  s[24] = s9 >> 3;
+  s[25] = s9 >> 11;
+  s[26] = (s9 >> 19) | (s10 << 2);
+  s[27] = s10 >> 6;
+  s[28] = (s10 >> 14) | (s11 << 7);
+  s[29] = s11 >> 1;
+  s[30] = s11 >> 9;
+  s[31] = s11 >> 17;
+
+  /* hush warnings after setting values to 0 */
+  (void)s12;
+  (void)s13;
+  (void)s14;
+  (void)s15;
+  (void)s16;
+  (void)s17;
+  (void)s18;
+  (void)s19;
+  (void)s20;
+  (void)s21;
+  (void)s22;
+  (void)s23;
+}
 
 
-r = p + q
+/*
+Input:
+  a[0]+256*a[1]+...+256^31*a[31] = a
+  b[0]+256*b[1]+...+256^31*b[31] = b
+  c[0]+256*c[1]+...+256^31*c[31] = c
+
+Output:
+  s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l
+  where l = 2^252 + 27742317777372353535851937790883648493.
 */
+void sc_muladd(byte* s, const byte* a, const byte* b, const byte* c)
+{
+  int64_t a0 = 2097151 & load_3(a);
+  int64_t a1 = 2097151 & (load_4(a + 2) >> 5);
+  int64_t a2 = 2097151 & (load_3(a + 5) >> 2);
+  int64_t a3 = 2097151 & (load_4(a + 7) >> 7);
+  int64_t a4 = 2097151 & (load_4(a + 10) >> 4);
+  int64_t a5 = 2097151 & (load_3(a + 13) >> 1);
+  int64_t a6 = 2097151 & (load_4(a + 15) >> 6);
+  int64_t a7 = 2097151 & (load_3(a + 18) >> 3);
+  int64_t a8 = 2097151 & load_3(a + 21);
+  int64_t a9 = 2097151 & (load_4(a + 23) >> 5);
+  int64_t a10 = 2097151 & (load_3(a + 26) >> 2);
+  int64_t a11 = (load_4(a + 28) >> 7);
+  int64_t b0 = 2097151 & load_3(b);
+  int64_t b1 = 2097151 & (load_4(b + 2) >> 5);
+  int64_t b2 = 2097151 & (load_3(b + 5) >> 2);
+  int64_t b3 = 2097151 & (load_4(b + 7) >> 7);
+  int64_t b4 = 2097151 & (load_4(b + 10) >> 4);
+  int64_t b5 = 2097151 & (load_3(b + 13) >> 1);
+  int64_t b6 = 2097151 & (load_4(b + 15) >> 6);
+  int64_t b7 = 2097151 & (load_3(b + 18) >> 3);
+  int64_t b8 = 2097151 & load_3(b + 21);
+  int64_t b9 = 2097151 & (load_4(b + 23) >> 5);
+  int64_t b10 = 2097151 & (load_3(b + 26) >> 2);
+  int64_t b11 = (load_4(b + 28) >> 7);
+  int64_t c0 = 2097151 & load_3(c);
+  int64_t c1 = 2097151 & (load_4(c + 2) >> 5);
+  int64_t c2 = 2097151 & (load_3(c + 5) >> 2);
+  int64_t c3 = 2097151 & (load_4(c + 7) >> 7);
+  int64_t c4 = 2097151 & (load_4(c + 10) >> 4);
+  int64_t c5 = 2097151 & (load_3(c + 13) >> 1);
+  int64_t c6 = 2097151 & (load_4(c + 15) >> 6);
+  int64_t c7 = 2097151 & (load_3(c + 18) >> 3);
+  int64_t c8 = 2097151 & load_3(c + 21);
+  int64_t c9 = 2097151 & (load_4(c + 23) >> 5);
+  int64_t c10 = 2097151 & (load_3(c + 26) >> 2);
+  int64_t c11 = (load_4(c + 28) >> 7);
+  int64_t s0;
+  int64_t s1;
+  int64_t s2;
+  int64_t s3;
+  int64_t s4;
+  int64_t s5;
+  int64_t s6;
+  int64_t s7;
+  int64_t s8;
+  int64_t s9;
+  int64_t s10;
+  int64_t s11;
+  int64_t s12;
+  int64_t s13;
+  int64_t s14;
+  int64_t s15;
+  int64_t s16;
+  int64_t s17;
+  int64_t s18;
+  int64_t s19;
+  int64_t s20;
+  int64_t s21;
+  int64_t s22;
+  int64_t s23;
+  int64_t carry0;
+  int64_t carry1;
+  int64_t carry2;
+  int64_t carry3;
+  int64_t carry4;
+  int64_t carry5;
+  int64_t carry6;
+  int64_t carry7;
+  int64_t carry8;
+  int64_t carry9;
+  int64_t carry10;
+  int64_t carry11;
+  int64_t carry12;
+  int64_t carry13;
+  int64_t carry14;
+  int64_t carry15;
+  int64_t carry16;
+  int64_t carry17;
+  int64_t carry18;
+  int64_t carry19;
+  int64_t carry20;
+  int64_t carry21;
+  int64_t carry22;
+
+  s0 = c0 + a0*b0;
+  s1 = c1 + a0*b1 + a1*b0;
+  s2 = c2 + a0*b2 + a1*b1 + a2*b0;
+  s3 = c3 + a0*b3 + a1*b2 + a2*b1 + a3*b0;
+  s4 = c4 + a0*b4 + a1*b3 + a2*b2 + a3*b1 + a4*b0;
+  s5 = c5 + a0*b5 + a1*b4 + a2*b3 + a3*b2 + a4*b1 + a5*b0;
+  s6 = c6 + a0*b6 + a1*b5 + a2*b4 + a3*b3 + a4*b2 + a5*b1 + a6*b0;
+  s7 = c7 + a0*b7 + a1*b6 + a2*b5 + a3*b4 + a4*b3 + a5*b2 + a6*b1 + a7*b0;
+  s8 = c8 + a0*b8 + a1*b7 + a2*b6 + a3*b5 + a4*b4 + a5*b3 + a6*b2 + a7*b1
+          + a8*b0;
+  s9 = c9 + a0*b9 + a1*b8 + a2*b7 + a3*b6 + a4*b5 + a5*b4 + a6*b3 + a7*b2
+          + a8*b1 + a9*b0;
+  s10 = c10 + a0*b10 + a1*b9 + a2*b8 + a3*b7 + a4*b6 + a5*b5 + a6*b4 + a7*b3
+            + a8*b2 + a9*b1 + a10*b0;
+  s11 = c11 + a0*b11 + a1*b10 + a2*b9 + a3*b8 + a4*b7 + a5*b6 + a6*b5 + a7*b4
+            + a8*b3 + a9*b2 + a10*b1 + a11*b0;
+  s12 = a1*b11 + a2*b10 + a3*b9 + a4*b8 + a5*b7 + a6*b6 + a7*b5 + a8*b4 + a9*b3
+               + a10*b2 + a11*b1;
+  s13 = a2*b11 + a3*b10 + a4*b9 + a5*b8 + a6*b7 + a7*b6 + a8*b5 + a9*b4 + a10*b3
+               + a11*b2;
+  s14 = a3*b11 + a4*b10 + a5*b9 + a6*b8 + a7*b7 + a8*b6 + a9*b5 + a10*b4
+               + a11*b3;
+  s15 = a4*b11 + a5*b10 + a6*b9 + a7*b8 + a8*b7 + a9*b6 + a10*b5 + a11*b4;
+  s16 = a5*b11 + a6*b10 + a7*b9 + a8*b8 + a9*b7 + a10*b6 + a11*b5;
+  s17 = a6*b11 + a7*b10 + a8*b9 + a9*b8 + a10*b7 + a11*b6;
+  s18 = a7*b11 + a8*b10 + a9*b9 + a10*b8 + a11*b7;
+  s19 = a8*b11 + a9*b10 + a10*b9 + a11*b8;
+  s20 = a9*b11 + a10*b10 + a11*b9;
+  s21 = a10*b11 + a11*b10;
+  s22 = a11*b11;
+  s23 = 0;
+
+  carry0 = (s0 + (1<<20)) >> 21; s1 += carry0; s0 -= carry0 << 21;
+  carry2 = (s2 + (1<<20)) >> 21; s3 += carry2; s2 -= carry2 << 21;
+  carry4 = (s4 + (1<<20)) >> 21; s5 += carry4; s4 -= carry4 << 21;
+  carry6 = (s6 + (1<<20)) >> 21; s7 += carry6; s6 -= carry6 << 21;
+  carry8 = (s8 + (1<<20)) >> 21; s9 += carry8; s8 -= carry8 << 21;
+  carry10 = (s10 + (1<<20)) >> 21; s11 += carry10; s10 -= carry10 << 21;
+  carry12 = (s12 + (1<<20)) >> 21; s13 += carry12; s12 -= carry12 << 21;
+  carry14 = (s14 + (1<<20)) >> 21; s15 += carry14; s14 -= carry14 << 21;
+  carry16 = (s16 + (1<<20)) >> 21; s17 += carry16; s16 -= carry16 << 21;
+  carry18 = (s18 + (1<<20)) >> 21; s19 += carry18; s18 -= carry18 << 21;
+  carry20 = (s20 + (1<<20)) >> 21; s21 += carry20; s20 -= carry20 << 21;
+  carry22 = (s22 + (1<<20)) >> 21; s23 += carry22; s22 -= carry22 << 21;
+
+  carry1 = (s1 + (1<<20)) >> 21; s2 += carry1; s1 -= carry1 << 21;
+  carry3 = (s3 + (1<<20)) >> 21; s4 += carry3; s3 -= carry3 << 21;
+  carry5 = (s5 + (1<<20)) >> 21; s6 += carry5; s5 -= carry5 << 21;
+  carry7 = (s7 + (1<<20)) >> 21; s8 += carry7; s7 -= carry7 << 21;
+  carry9 = (s9 + (1<<20)) >> 21; s10 += carry9; s9 -= carry9 << 21;
+  carry11 = (s11 + (1<<20)) >> 21; s12 += carry11; s11 -= carry11 << 21;
+  carry13 = (s13 + (1<<20)) >> 21; s14 += carry13; s13 -= carry13 << 21;
+  carry15 = (s15 + (1<<20)) >> 21; s16 += carry15; s15 -= carry15 << 21;
+  carry17 = (s17 + (1<<20)) >> 21; s18 += carry17; s17 -= carry17 << 21;
+  carry19 = (s19 + (1<<20)) >> 21; s20 += carry19; s19 -= carry19 << 21;
+  carry21 = (s21 + (1<<20)) >> 21; s22 += carry21; s21 -= carry21 << 21;
+
+  s11 += s23 * 666643;
+  s12 += s23 * 470296;
+  s13 += s23 * 654183;
+  s14 -= s23 * 997805;
+  s15 += s23 * 136657;
+  s16 -= s23 * 683901;
+  s23 = 0;
+
+  s10 += s22 * 666643;
+  s11 += s22 * 470296;
+  s12 += s22 * 654183;
+  s13 -= s22 * 997805;
+  s14 += s22 * 136657;
+  s15 -= s22 * 683901;
+  s22 = 0;
+
+  s9 += s21 * 666643;
+  s10 += s21 * 470296;
+  s11 += s21 * 654183;
+  s12 -= s21 * 997805;
+  s13 += s21 * 136657;
+  s14 -= s21 * 683901;
+  s21 = 0;
+
+  s8 += s20 * 666643;
+  s9 += s20 * 470296;
+  s10 += s20 * 654183;
+  s11 -= s20 * 997805;
+  s12 += s20 * 136657;
+  s13 -= s20 * 683901;
+  s20 = 0;
+
+  s7 += s19 * 666643;
+  s8 += s19 * 470296;
+  s9 += s19 * 654183;
+  s10 -= s19 * 997805;
+  s11 += s19 * 136657;
+  s12 -= s19 * 683901;
+  s19 = 0;
+
+  s6 += s18 * 666643;
+  s7 += s18 * 470296;
+  s8 += s18 * 654183;
+  s9 -= s18 * 997805;
+  s10 += s18 * 136657;
+  s11 -= s18 * 683901;
+  s18 = 0;
+
+  carry6 = (s6 + (1<<20)) >> 21; s7 += carry6; s6 -= carry6 << 21;
+  carry8 = (s8 + (1<<20)) >> 21; s9 += carry8; s8 -= carry8 << 21;
+  carry10 = (s10 + (1<<20)) >> 21; s11 += carry10; s10 -= carry10 << 21;
+  carry12 = (s12 + (1<<20)) >> 21; s13 += carry12; s12 -= carry12 << 21;
+  carry14 = (s14 + (1<<20)) >> 21; s15 += carry14; s14 -= carry14 << 21;
+  carry16 = (s16 + (1<<20)) >> 21; s17 += carry16; s16 -= carry16 << 21;
+
+  carry7 = (s7 + (1<<20)) >> 21; s8 += carry7; s7 -= carry7 << 21;
+  carry9 = (s9 + (1<<20)) >> 21; s10 += carry9; s9 -= carry9 << 21;
+  carry11 = (s11 + (1<<20)) >> 21; s12 += carry11; s11 -= carry11 << 21;
+  carry13 = (s13 + (1<<20)) >> 21; s14 += carry13; s13 -= carry13 << 21;
+  carry15 = (s15 + (1<<20)) >> 21; s16 += carry15; s15 -= carry15 << 21;
+
+  s5 += s17 * 666643;
+  s6 += s17 * 470296;
+  s7 += s17 * 654183;
+  s8 -= s17 * 997805;
+  s9 += s17 * 136657;
+  s10 -= s17 * 683901;
+  s17 = 0;
+
+  s4 += s16 * 666643;
+  s5 += s16 * 470296;
+  s6 += s16 * 654183;
+  s7 -= s16 * 997805;
+  s8 += s16 * 136657;
+  s9 -= s16 * 683901;
+  s16 = 0;
+
+  s3 += s15 * 666643;
+  s4 += s15 * 470296;
+  s5 += s15 * 654183;
+  s6 -= s15 * 997805;
+  s7 += s15 * 136657;
+  s8 -= s15 * 683901;
+  s15 = 0;
+
+  s2 += s14 * 666643;
+  s3 += s14 * 470296;
+  s4 += s14 * 654183;
+  s5 -= s14 * 997805;
+  s6 += s14 * 136657;
+  s7 -= s14 * 683901;
+  s14 = 0;
+
+  s1 += s13 * 666643;
+  s2 += s13 * 470296;
+  s3 += s13 * 654183;
+  s4 -= s13 * 997805;
+  s5 += s13 * 136657;
+  s6 -= s13 * 683901;
+  s13 = 0;
+
+  s0 += s12 * 666643;
+  s1 += s12 * 470296;
+  s2 += s12 * 654183;
+  s3 -= s12 * 997805;
+  s4 += s12 * 136657;
+  s5 -= s12 * 683901;
+  s12 = 0;
+
+  carry0 = (s0 + (1<<20)) >> 21; s1 += carry0; s0 -= carry0 << 21;
+  carry2 = (s2 + (1<<20)) >> 21; s3 += carry2; s2 -= carry2 << 21;
+  carry4 = (s4 + (1<<20)) >> 21; s5 += carry4; s4 -= carry4 << 21;
+  carry6 = (s6 + (1<<20)) >> 21; s7 += carry6; s6 -= carry6 << 21;
+  carry8 = (s8 + (1<<20)) >> 21; s9 += carry8; s8 -= carry8 << 21;
+  carry10 = (s10 + (1<<20)) >> 21; s11 += carry10; s10 -= carry10 << 21;
+
+  carry1 = (s1 + (1<<20)) >> 21; s2 += carry1; s1 -= carry1 << 21;
+  carry3 = (s3 + (1<<20)) >> 21; s4 += carry3; s3 -= carry3 << 21;
+  carry5 = (s5 + (1<<20)) >> 21; s6 += carry5; s5 -= carry5 << 21;
+  carry7 = (s7 + (1<<20)) >> 21; s8 += carry7; s7 -= carry7 << 21;
+  carry9 = (s9 + (1<<20)) >> 21; s10 += carry9; s9 -= carry9 << 21;
+  carry11 = (s11 + (1<<20)) >> 21; s12 += carry11; s11 -= carry11 << 21;
+
+  s0 += s12 * 666643;
+  s1 += s12 * 470296;
+  s2 += s12 * 654183;
+  s3 -= s12 * 997805;
+  s4 += s12 * 136657;
+  s5 -= s12 * 683901;
+  s12 = 0;
+
+  carry0 = s0 >> 21; s1 += carry0; s0 -= carry0 << 21;
+  carry1 = s1 >> 21; s2 += carry1; s1 -= carry1 << 21;
+  carry2 = s2 >> 21; s3 += carry2; s2 -= carry2 << 21;
+  carry3 = s3 >> 21; s4 += carry3; s3 -= carry3 << 21;
+  carry4 = s4 >> 21; s5 += carry4; s4 -= carry4 << 21;
+  carry5 = s5 >> 21; s6 += carry5; s5 -= carry5 << 21;
+  carry6 = s6 >> 21; s7 += carry6; s6 -= carry6 << 21;
+  carry7 = s7 >> 21; s8 += carry7; s7 -= carry7 << 21;
+  carry8 = s8 >> 21; s9 += carry8; s8 -= carry8 << 21;
+  carry9 = s9 >> 21; s10 += carry9; s9 -= carry9 << 21;
+  carry10 = s10 >> 21; s11 += carry10; s10 -= carry10 << 21;
+  carry11 = s11 >> 21; s12 += carry11; s11 -= carry11 << 21;
+
+  s0 += s12 * 666643;
+  s1 += s12 * 470296;
+  s2 += s12 * 654183;
+  s3 -= s12 * 997805;
+  s4 += s12 * 136657;
+  s5 -= s12 * 683901;
+  s12 = 0;
+
+  carry0 = s0 >> 21; s1 += carry0; s0 -= carry0 << 21;
+  carry1 = s1 >> 21; s2 += carry1; s1 -= carry1 << 21;
+  carry2 = s2 >> 21; s3 += carry2; s2 -= carry2 << 21;
+  carry3 = s3 >> 21; s4 += carry3; s3 -= carry3 << 21;
+  carry4 = s4 >> 21; s5 += carry4; s4 -= carry4 << 21;
+  carry5 = s5 >> 21; s6 += carry5; s5 -= carry5 << 21;
+  carry6 = s6 >> 21; s7 += carry6; s6 -= carry6 << 21;
+  carry7 = s7 >> 21; s8 += carry7; s7 -= carry7 << 21;
+  carry8 = s8 >> 21; s9 += carry8; s8 -= carry8 << 21;
+  carry9 = s9 >> 21; s10 += carry9; s9 -= carry9 << 21;
+  carry10 = s10 >> 21; s11 += carry10; s10 -= carry10 << 21;
+
+  s[0] = s0 >> 0;
+  s[1] = s0 >> 8;
+  s[2] = (s0 >> 16) | (s1 << 5);
+  s[3] = s1 >> 3;
+  s[4] = s1 >> 11;
+  s[5] = (s1 >> 19) | (s2 << 2);
+  s[6] = s2 >> 6;
+  s[7] = (s2 >> 14) | (s3 << 7);
+  s[8] = s3 >> 1;
+  s[9] = s3 >> 9;
+  s[10] = (s3 >> 17) | (s4 << 4);
+  s[11] = s4 >> 4;
+  s[12] = s4 >> 12;
+  s[13] = (s4 >> 20) | (s5 << 1);
+  s[14] = s5 >> 7;
+  s[15] = (s5 >> 15) | (s6 << 6);
+  s[16] = s6 >> 2;
+  s[17] = s6 >> 10;
+  s[18] = (s6 >> 18) | (s7 << 3);
+  s[19] = s7 >> 5;
+  s[20] = s7 >> 13;
+  s[21] = s8 >> 0;
+  s[22] = s8 >> 8;
+  s[23] = (s8 >> 16) | (s9 << 5);
+  s[24] = s9 >> 3;
+  s[25] = s9 >> 11;
+  s[26] = (s9 >> 19) | (s10 << 2);
+  s[27] = s10 >> 6;
+  s[28] = (s10 >> 14) | (s11 << 7);
+  s[29] = s11 >> 1;
+  s[30] = s11 >> 9;
+  s[31] = s11 >> 17;
+
+  /* hush warnings after setting values to 0 */
+  (void)s12;
+  (void)s13;
+  (void)s14;
+  (void)s15;
+  (void)s16;
+  (void)s17;
+  (void)s18;
+  (void)s19;
+  (void)s20;
+  (void)s21;
+  (void)s22;
+  (void)s23;
+}
+
+
+int ge_compress_key(byte* out, const byte* xIn, const byte* yIn, word32 keySz)
+{
+    fe     x,y,z;
+    ge_p3  g;
+    byte   bArray[keySz];
+    word32 i;
+
+    fe_0(x);
+    fe_0(y);
+    fe_1(z);
+    fe_frombytes(x, xIn);
+    fe_frombytes(y, yIn);
+
+    fe_copy(g.X, x);
+    fe_copy(g.Y, y);
+    fe_copy(g.Z, z);
 
+    ge_p3_tobytes(bArray, &g);
+
+    for (i = 0; i < keySz; i++) {
+        out[keySz - 1 - i] = bArray[i];
+    }
+
+    return 0;
+}
+
+
+/*
+r = p + q
+*/
 void ge_add(ge_p1p1 *r,const ge_p3 *p,const ge_cached *q)
 {
 	fe t0;
@@ -89,7 +747,8 @@ static unsigned char equal(signed char b,signed char c)
 
 static unsigned char negative(signed char b)
 {
-  unsigned long long x = b; /* 18446744073709551361..18446744073709551615: yes; 0..255: no */
+  unsigned long long x = b; /* 18446744073709551361..18446744073709551615:
+                               yes; 0..255: no */
   x >>= 63; /* 1: yes; 0: no */
   return x;
 }
@@ -1482,7 +2141,6 @@ B is the Ed25519 base point (x,4/5) with x positive.
 Preconditions:
   a[31] <= 127
 */
-
 void ge_scalarmult_base(ge_p3 *h,const unsigned char *a)
 {
   signed char e[64];
@@ -1610,8 +2268,8 @@ where a = a[0]+256*a[1]+...+256^31 a[31].
 and b = b[0]+256*b[1]+...+256^31 b[31].
 B is the Ed25519 base point (x,4/5) with x positive.
 */
-
-void ge_double_scalarmult_vartime(ge_p2 *r,const unsigned char *a,const ge_p3 *A,const unsigned char *b)
+int ge_double_scalarmult_vartime(ge_p2 *r, const unsigned char *a, 
+                                 const ge_p3 *A, const unsigned char *b)
 {
   signed char aslide[256];
   signed char bslide[256];
@@ -1661,16 +2319,20 @@ void ge_double_scalarmult_vartime(ge_p2 *r,const unsigned char *a,const ge_p3 *A
 
     ge_p1p1_to_p2(r,&t);
   }
+
+  return 0;
 }
 
 
 static const fe d = {
--10913610,13857413,-15372611,6949391,114729,-8787816,-6275908,-3247719,-18696448,-12055116
+-10913610,13857413,-15372611,6949391,114729,
+-8787816,-6275908,-3247719,-18696448,-12055116
 } ;
 
 
 static const fe sqrtm1 = {
--32595792,-7943725,9377950,3500415,12389472,-272473,-25146209,-2005654,326686,11406482
+-32595792,-7943725,9377950,3500415,12389472,
+-272473,-25146209,-2005654,326686,11406482
 } ;
 
 
@@ -1689,6 +2351,7 @@ int ge_frombytes_negate_vartime(ge_p3 *h,const unsigned char *s)
   fe_sub(u,u,h->Z);       /* u = y^2-1 */
   fe_add(v,v,h->Z);       /* v = dy^2+1 */
 
+
   fe_sq(v3,v);
   fe_mul(v3,v3,v);        /* v3 = v^3 */
   fe_sq(h->X,v3);
@@ -1850,7 +2513,8 @@ r = p
 */
 
 static const fe d2 = {
--21827239,-5839606,-30745221,13898782,229458,15978800,-12551817,-6495438,29715968,9444199
+-21827239,-5839606,-30745221,13898782,229458,
+15978800,-12551817,-6495438,29715968,9444199
 } ;
 
 
diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c
index 6620a5fdad7..ee199092912 100644
--- a/wolfcrypt/test/test.c
+++ b/wolfcrypt/test/test.c
@@ -5728,54 +5728,54 @@ int ed25519_test(void)
 
         if (wc_ed25519_import_private_key(sKeys[i], ED25519_KEY_SIZE, pKeys[i],
                 pKeySz[i], &key) != 0)
-            return -1021;
+            return -1021 - i;
 
         if (wc_ed25519_sign_msg(msgs[i], msgSz[i], out, &outlen, &key)
                 != 0)
-            return -1022;
+            return -1027 - i;
 
         if (XMEMCMP(out, sigs[i], 64))
-            return -1023;
+            return -1033 - i;
 
         /* test verify on good msg */
         if (wc_ed25519_verify_msg(out, outlen, msgs[i], msgSz[i], &verify,
                     &key) != 0 || verify != 1)
-            return -1024;
+            return -1039 - i;
 
         /* test verify on bad msg */
         out[outlen-1] = out[outlen-1] + 1;
         if (wc_ed25519_verify_msg(out, outlen, msgs[i], msgSz[i], &verify,
                     &key) == 0 || verify == 1)
-            return -1025;
+            return -1045 - i;
 
         /* test api for import/exporting keys */
         exportPSz = sizeof(exportPKey);
         exportSSz = sizeof(exportSKey);
         if (wc_ed25519_export_public(&key, exportPKey, &exportPSz) != 0)
-            return -1026;
+            return -1051 - i;
 
         if (wc_ed25519_import_public(exportPKey, exportPSz, &key2) != 0)
-            return -1027;
+            return -1057 - i;
 
         if (wc_ed25519_export_private_only(&key, exportSKey, &exportSSz) != 0)
-            return -1028;
+            return -1063 - i;
 
         if (wc_ed25519_import_private_key(exportSKey, exportSSz,
                                           exportPKey, exportPSz, &key2) != 0)
-            return -1029;
+            return -1069 - i;
 
         /* clear "out" buffer and test sign with imported keys */
         outlen = sizeof(out);
         XMEMSET(out, 0, sizeof(out));
         if (wc_ed25519_sign_msg(msgs[i], msgSz[i], out, &outlen, &key2) != 0)
-            return -1030;
+            return -1075 - i;
 
         if (wc_ed25519_verify_msg(out, outlen, msgs[i], msgSz[i], &verify,
                                   &key2) != 0 || verify != 1)
-            return -1031;
+            return -1081 - i;
 
         if (XMEMCMP(out, sigs[i], 64))
-            return -1032;
+            return -1087 - i;
     }
 
     /* clean up keys when done */
diff --git a/wolfssl/wolfcrypt/fe_operations.h b/wolfssl/wolfcrypt/fe_operations.h
index 3cd49015a49..a779f2be7a6 100644
--- a/wolfssl/wolfcrypt/fe_operations.h
+++ b/wolfssl/wolfcrypt/fe_operations.h
@@ -19,9 +19,6 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
  */
 
- /* Based On Daniel J Bernstein's curve25519 and ed25519 Public Domain ref10
-    work. */
-
 #ifndef WOLF_CRYPT_FE_OPERATIONS_H
 #define WOLF_CRYPT_FE_OPERATIONS_H
 
@@ -29,7 +26,10 @@
 
 #if defined(HAVE_CURVE25519) || defined(HAVE_ED25519)
 
-#include <stdint.h>
+#ifndef CURVED25519_SMALL
+    #include <stdint.h>
+#endif
+#include <wolfssl/wolfcrypt/types.h>
 
 /*
 fe means field element.
@@ -39,29 +39,94 @@ t[0]+2^26 t[1]+2^51 t[2]+2^77 t[3]+2^102 t[4]+...+2^230 t[9].
 Bounds on each t[i] vary depending on context.
 */
 
-typedef int32_t fe[10];
+#ifdef CURVED25519_SMALL
+    #define F25519_SIZE	32
+    typedef byte     fe[32];
+#else
+    typedef int32_t  fe[10];
+#endif
 
-WOLFSSL_LOCAL void fe_0(fe);
-WOLFSSL_LOCAL void fe_1(fe);
+WOLFSSL_LOCAL int  curve25519(byte * q, byte * n, byte * p);
+WOLFSSL_LOCAL void fe_copy(fe, const fe);
 WOLFSSL_LOCAL void fe_add(fe, const fe, const fe);
-WOLFSSL_LOCAL void fe_tobytes(unsigned char *, const fe);
+WOLFSSL_LOCAL void fe_neg(fe,const fe);
 WOLFSSL_LOCAL void fe_sub(fe, const fe, const fe);
 WOLFSSL_LOCAL void fe_invert(fe, const fe);
+WOLFSSL_LOCAL void fe_mul(fe,const fe,const fe);
+
+/* default to be faster but take more memory */
+#ifndef CURVED25519_SMALL
+
+/* Based On Daniel J Bernstein's curve25519 and ed25519 Public Domain ref10
+   work. */
+
+WOLFSSL_LOCAL void fe_0(fe);
+WOLFSSL_LOCAL void fe_1(fe);
+WOLFSSL_LOCAL int  fe_isnonzero(const fe);
+WOLFSSL_LOCAL int  fe_isnegative(const fe);
+WOLFSSL_LOCAL void fe_tobytes(unsigned char *, const fe);
 WOLFSSL_LOCAL void fe_sq(fe, const fe);
 WOLFSSL_LOCAL void fe_sq2(fe,const fe);
 WOLFSSL_LOCAL void fe_frombytes(fe,const unsigned char *);
-WOLFSSL_LOCAL void fe_mul(fe,const fe,const fe);
-WOLFSSL_LOCAL void fe_copy(fe, const fe);
 WOLFSSL_LOCAL void fe_cswap(fe,fe,unsigned int);
 WOLFSSL_LOCAL void fe_mul121666(fe,fe);
-WOLFSSL_LOCAL int fe_isnonzero(const fe);
-WOLFSSL_LOCAL int fe_isnegative(const fe);
 WOLFSSL_LOCAL void fe_cmov(fe,const fe,unsigned int);
-WOLFSSL_LOCAL void fe_neg(fe,const fe);
 WOLFSSL_LOCAL void fe_pow22523(fe,const fe);
+
+/* 64 type needed for SHA512 */
 WOLFSSL_LOCAL uint64_t load_3(const unsigned char *in);
 WOLFSSL_LOCAL uint64_t load_4(const unsigned char *in);
+#endif /* not defined CURVED25519_SMALL */
+
+/* Use less memory and only 32bit types or less, but is slower
+   Based on Daniel Beer's public domain work. */
+#ifdef CURVED25519_SMALL
+static const byte c25519_base_x[F25519_SIZE] = {9};
+static const byte f25519_zero[F25519_SIZE]   = {0};
+static const byte f25519_one[F25519_SIZE]    = {1};
+static const byte fprime_zero[F25519_SIZE]   = {0};
+static const byte fprime_one[F25519_SIZE]    = {1};
+
+WOLFSSL_LOCAL void fe_load(byte *x, word32 c);
+WOLFSSL_LOCAL void fe_normalize(byte *x);
+WOLFSSL_LOCAL void fe_inv__distinct(byte *r, const byte *x);
+
+/* Conditional copy. If condition == 0, then zero is copied to dst. If
+ * condition == 1, then one is copied to dst. Any other value results in
+ * undefined behaviour.
+ */
+WOLFSSL_LOCAL void fe_select(byte *dst, const byte *zero, const byte *one,
+		   byte condition);
 
+/* Multiply a point by a small constant. The two pointers are not
+ * required to be distinct.
+ *
+ * The constant must be less than 2^24.
+ */
+WOLFSSL_LOCAL void fe_mul_c(byte *r, const byte *a, word32 b);
+WOLFSSL_LOCAL void fe_mul__distinct(byte *r, const byte *a, const byte *b);
+
+/* Compute one of the square roots of the field element, if the element
+ * is square. The other square is -r.
+ *
+ * If the input is not square, the returned value is a valid field
+ * element, but not the correct answer. If you don't already know that
+ * your element is square, you should square the return value and test.
+ */
+WOLFSSL_LOCAL void fe_sqrt(byte *r, const byte *x);
+
+/* Conditional copy. If condition == 0, then zero is copied to dst. If
+ * condition == 1, then one is copied to dst. Any other value results in
+ * undefined behaviour.
+ */
+WOLFSSL_LOCAL void fprime_select(byte *dst, const byte *zero, const byte *one,
+		                         byte condition);
+WOLFSSL_LOCAL void fprime_add(byte *r, const byte *a, const byte *modulus);
+WOLFSSL_LOCAL void fprime_sub(byte *r, const byte *a, const byte *modulus);
+WOLFSSL_LOCAL void fprime_mul(byte *r, const byte *a, const byte *b,
+		                      const byte *modulus);
+WOLFSSL_LOCAL void fprime_copy(byte *x, const byte *a);
+#endif /* CURVED25519_SMALL */
 #endif /* HAVE_CURVE25519 or HAVE_ED25519 */
 #endif /* WOLF_CRYPT_FE_OPERATIONS_H */
 
diff --git a/wolfssl/wolfcrypt/ge_operations.h b/wolfssl/wolfcrypt/ge_operations.h
index 8cd09c5475a..00d1b3edc67 100644
--- a/wolfssl/wolfcrypt/ge_operations.h
+++ b/wolfssl/wolfcrypt/ge_operations.h
@@ -28,7 +28,9 @@
 
 #ifdef HAVE_ED25519
 
-#include <stdint.h>
+#ifndef CURVED25519_SMALL
+    #include <stdint.h>
+#endif
 #include <wolfssl/wolfcrypt/fe_operations.h>
 
 /*
@@ -59,6 +61,20 @@ typedef struct {
   fe T;
 } ge_p3;
 
+WOLFSSL_LOCAL int  ge_compress_key(byte* out, const byte* xIn, const byte* yIn,
+                                                                word32 keySz);
+WOLFSSL_LOCAL int  ge_frombytes_negate_vartime(ge_p3 *,const unsigned char *);
+
+WOLFSSL_LOCAL int  ge_double_scalarmult_vartime(ge_p2 *,const unsigned char *,
+                                         const ge_p3 *,const unsigned char *);
+WOLFSSL_LOCAL void ge_scalarmult_base(ge_p3 *,const unsigned char *);
+WOLFSSL_LOCAL void sc_reduce(byte* s);
+WOLFSSL_LOCAL void sc_muladd(byte* s, const byte* a, const byte* b,
+                             const byte* c);
+WOLFSSL_LOCAL void ge_tobytes(unsigned char *,const ge_p2 *);
+WOLFSSL_LOCAL void ge_p3_tobytes(unsigned char *,const ge_p3 *);
+
+#ifndef CURVED25519_SMALL
 typedef struct {
   fe X;
   fe Y;
@@ -79,10 +95,6 @@ typedef struct {
   fe T2d;
 } ge_cached;
 
-WOLFSSL_LOCAL void ge_tobytes(unsigned char *,const ge_p2 *);
-WOLFSSL_LOCAL void ge_p3_tobytes(unsigned char *,const ge_p3 *);
-WOLFSSL_LOCAL int ge_frombytes_negate_vartime(ge_p3 *,const unsigned char *);
-
 WOLFSSL_LOCAL void ge_p2_0(ge_p2 *);
 WOLFSSL_LOCAL void ge_p3_0(ge_p3 *);
 WOLFSSL_LOCAL void ge_precomp_0(ge_precomp *);
@@ -97,10 +109,7 @@ WOLFSSL_LOCAL void ge_madd(ge_p1p1 *,const ge_p3 *,const ge_precomp *);
 WOLFSSL_LOCAL void ge_msub(ge_p1p1 *,const ge_p3 *,const ge_precomp *);
 WOLFSSL_LOCAL void ge_add(ge_p1p1 *,const ge_p3 *,const ge_cached *);
 WOLFSSL_LOCAL void ge_sub(ge_p1p1 *,const ge_p3 *,const ge_cached *);
-WOLFSSL_LOCAL void ge_scalarmult_base(ge_p3 *,const unsigned char *);
-WOLFSSL_LOCAL void ge_double_scalarmult_vartime(ge_p2 *,const unsigned char *,
-                                         const ge_p3 *,const unsigned char *);
-
+#endif /* no CURVED25519_SMALL */
 #endif /* HAVE_ED25519 */
 #endif /* WOLF_CRYPT_GE_OPERATIONS_H */