squashme: Needs AMD benchmark. Don't use Intel-specific optimization
The algorithm used to calculate Sigma0/Sigma1 here is the same as the one used
in bitcoin#13400, which resulted in a ~5% speedup on Intel CPUs but close to a
25% penalty on AMD.

So, unfortunately, removing it should result in a ~5% slowdown on Intel.

However, bitcoin#13400 was operating on 128-bit registers and these operations
are 32-bit, so it's possible that there's no penalty on AMD and this change is
not needed.
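
For reference, the trick being removed computes Sigma1(e) = Ror(e,6) ^ Ror(e,11) ^ Ror(e,25) as a chain of dependent rotate-and-XOR steps: each step rotates by the difference between consecutive rotation amounts and folds e back in, so one final rotate distributes across all three terms. Below is a minimal standalone sketch (not part of the commit; Ror is reimplemented here and assumed to match the 32-bit rotate-right helper in sha256_sse41.cpp) that checks the identity:

#include <cassert>
#include <cstdint>

// Assumed 32-bit rotate-right, mirroring the Ror helper used in the diff.
static uint32_t Ror(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

int main()
{
    for (uint32_t e : {0x6a09e667u, 0xbb67ae85u, 0xdeadbeefu, 0x00000001u}) {
        // Plain form of Sigma1(e).
        uint32_t plain = Ror(e, 6) ^ Ror(e, 11) ^ Ror(e, 25);

        // Interleaved form from the deleted code.
        uint32_t y0 = Ror(e, 25 - 11) ^ e; // = Ror(e,14) ^ e
        y0 = Ror(y0, 11 - 6) ^ e;          // = Ror(e,19) ^ Ror(e,5) ^ e
        y0 = Ror(y0, 6);                   // = Ror(e,25) ^ Ror(e,11) ^ Ror(e,6)

        assert(plain == y0);
    }
    // Sigma0(a) = Ror(a,2) ^ Ror(a,13) ^ Ror(a,22) follows the same pattern,
    // with rotation deltas 22-13 and 13-2 and a final rotate by 2.
    return 0;
}

The chained form keeps two of the three rotates operating on an intermediate value already in flight, which apparently helped Intel scheduling but hurt AMD in the bitcoin#13400 benchmarks.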
theuni committed Jun 12, 2018
1 parent d79fb1d commit 4ee6fbb
Showing 1 changed file with 4 additions and 15 deletions.
19 changes: 4 additions & 15 deletions src/crypto/sha256_sse41.cpp
@@ -80,21 +80,10 @@ __m128i inline __attribute__((always_inline)) Pxor(__m128i x, __m128i y) { retur

void inline __attribute__((always_inline)) Round(uint32_t& a, uint32_t& b, uint32_t& c, uint32_t& d, uint32_t& e, uint32_t& f, uint32_t& g, uint32_t& h, uint32_t w)
{
-    uint32_t y0, y1, y2;
-    y0 = Ror(e, 25 - 11) ^ e;
-    y1 = Ror(a, 22 - 13) ^ a;
-    y2 = (f ^ g) & e;
-    y0 = Ror(y0, 11 - 6) ^ e;
-    y1 = Ror(y1, 13 - 2) ^ a;
-    y0 = Ror(y0, 6);
-    y2 = (y2 ^ g) + y0 + w;
-    y1 = Ror(y1, 2);
-    h += y2;
-    y0 = (a | c) & b;
-    d += h;
-    h += y1;
-    y0 |= (a & c);
-    h += y0;
+    uint32_t t1 = h + (Ror(e, 6) ^ Ror(e, 11) ^ Ror(e, 25)) + (((f ^ g) & e) ^ g) + w;
+    uint32_t t2 = (Ror(a, 2) ^ Ror(a, 13) ^ Ror(a, 22)) + (((a | c) & b) | (a & c));
+    d += t1;
+    h = t1 + t2;
}

void inline __attribute__((always_inline)) QuadRoundSched(uint32_t& a, uint32_t& b, uint32_t& c, uint32_t& d, uint32_t& e, uint32_t& f, uint32_t& g, uint32_t& h, __m128i& X0, __m128i X1, __m128i X2, __m128i X3, __m128i W)
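
The replacement is the textbook FIPS 180-4 round: t1 adds h, Sigma1(e), Ch(e,f,g) written as ((f ^ g) & e) ^ g, and w (presumably the message word with the round constant already folded in by the caller), while t2 adds Sigma0(a) and Maj(a,b,c) written as ((a | c) & b) | (a & c). Here is a throwaway harness (hypothetical, not from the commit) that runs the deleted and replacement round bodies side by side on random state and asserts they update d and h identically:

#include <cassert>
#include <cstdint>
#include <random>

static uint32_t Ror(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

// Deleted (interleaved, Intel-friendly) round body, copied from the diff.
static void RoundOld(uint32_t a, uint32_t b, uint32_t c, uint32_t& d,
                     uint32_t e, uint32_t f, uint32_t g, uint32_t& h, uint32_t w)
{
    uint32_t y0, y1, y2;
    y0 = Ror(e, 25 - 11) ^ e;
    y1 = Ror(a, 22 - 13) ^ a;
    y2 = (f ^ g) & e;
    y0 = Ror(y0, 11 - 6) ^ e;
    y1 = Ror(y1, 13 - 2) ^ a;
    y0 = Ror(y0, 6);
    y2 = (y2 ^ g) + y0 + w;
    y1 = Ror(y1, 2);
    h += y2;
    y0 = (a | c) & b;
    d += h;
    h += y1;
    y0 |= (a & c);
    h += y0;
}

// Replacement (straightforward) round body, copied from the diff.
static void RoundNew(uint32_t a, uint32_t b, uint32_t c, uint32_t& d,
                     uint32_t e, uint32_t f, uint32_t g, uint32_t& h, uint32_t w)
{
    uint32_t t1 = h + (Ror(e, 6) ^ Ror(e, 11) ^ Ror(e, 25)) + (((f ^ g) & e) ^ g) + w;
    uint32_t t2 = (Ror(a, 2) ^ Ror(a, 13) ^ Ror(a, 22)) + (((a | c) & b) | (a & c));
    d += t1;
    h = t1 + t2;
}

int main()
{
    std::mt19937 rng(1234);
    for (int i = 0; i < 100000; ++i) {
        uint32_t a = rng(), b = rng(), c = rng(), e = rng(), f = rng(), g = rng(), w = rng();
        uint32_t d1 = rng(), h1 = rng();
        uint32_t d2 = d1, h2 = h1;
        RoundOld(a, b, c, d1, e, f, g, h1, w);
        RoundNew(a, b, c, d2, e, f, g, h2, w);
        assert(d1 == d2 && h1 == h2); // both forms produce identical d and h
    }
    return 0;
}

Since the two bodies are algebraically equal, the change is purely a scheduling/performance trade, which is exactly what the benchmark caveat in the commit message is about.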
