Skip to content

Commit

Permalink
Combined hash and fill AES loop (#166)
Browse files Browse the repository at this point in the history
Adds more parallelizm into AES loop so modern CPUs can take advantage of it. Also, scratchpad data moves between L1 and L3 caches only one time which saves time and energy per hash.
  • Loading branch information
SChernykh authored and tevador committed Dec 1, 2019
1 parent e3561d6 commit 219c02e
Show file tree
Hide file tree
Showing 8 changed files with 139 additions and 2 deletions.
81 changes: 81 additions & 0 deletions src/aes_hash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,3 +239,84 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) {

template void fillAes4Rx4<true>(void *state, size_t outputSize, void *buffer);
template void fillAes4Rx4<false>(void *state, size_t outputSize, void *buffer);

template<bool softAes>
void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) {
uint8_t* scratchpadPtr = (uint8_t*)scratchpad;
const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize;

// initial state
rx_vec_i128 hash_state0 = rx_set_int_vec_i128(AES_HASH_1R_STATE0);
rx_vec_i128 hash_state1 = rx_set_int_vec_i128(AES_HASH_1R_STATE1);
rx_vec_i128 hash_state2 = rx_set_int_vec_i128(AES_HASH_1R_STATE2);
rx_vec_i128 hash_state3 = rx_set_int_vec_i128(AES_HASH_1R_STATE3);

const rx_vec_i128 key0 = rx_set_int_vec_i128(AES_GEN_1R_KEY0);
const rx_vec_i128 key1 = rx_set_int_vec_i128(AES_GEN_1R_KEY1);
const rx_vec_i128 key2 = rx_set_int_vec_i128(AES_GEN_1R_KEY2);
const rx_vec_i128 key3 = rx_set_int_vec_i128(AES_GEN_1R_KEY3);

rx_vec_i128 fill_state0 = rx_load_vec_i128((rx_vec_i128*)fill_state + 0);
rx_vec_i128 fill_state1 = rx_load_vec_i128((rx_vec_i128*)fill_state + 1);
rx_vec_i128 fill_state2 = rx_load_vec_i128((rx_vec_i128*)fill_state + 2);
rx_vec_i128 fill_state3 = rx_load_vec_i128((rx_vec_i128*)fill_state + 3);

constexpr int PREFETCH_DISTANCE = 4096;
const char* prefetchPtr = ((const char*)scratchpad) + PREFETCH_DISTANCE;
scratchpadEnd -= PREFETCH_DISTANCE;

for (int i = 0; i < 2; ++i) {
//process 64 bytes at a time in 4 lanes
while (scratchpadPtr < scratchpadEnd) {
hash_state0 = aesenc<softAes>(hash_state0, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 0));
hash_state1 = aesdec<softAes>(hash_state1, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 1));
hash_state2 = aesenc<softAes>(hash_state2, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 2));
hash_state3 = aesdec<softAes>(hash_state3, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 3));

fill_state0 = aesdec<softAes>(fill_state0, key0);
fill_state1 = aesenc<softAes>(fill_state1, key1);
fill_state2 = aesdec<softAes>(fill_state2, key2);
fill_state3 = aesenc<softAes>(fill_state3, key3);

rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 0, fill_state0);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 1, fill_state1);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 2, fill_state2);
rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 3, fill_state3);

rx_prefetch_t0(prefetchPtr);

scratchpadPtr += 64;
prefetchPtr += 64;
}
prefetchPtr = (const char*) scratchpad;
scratchpadEnd += PREFETCH_DISTANCE;
}

rx_store_vec_i128((rx_vec_i128*)fill_state + 0, fill_state0);
rx_store_vec_i128((rx_vec_i128*)fill_state + 1, fill_state1);
rx_store_vec_i128((rx_vec_i128*)fill_state + 2, fill_state2);
rx_store_vec_i128((rx_vec_i128*)fill_state + 3, fill_state3);

//two extra rounds to achieve full diffusion
rx_vec_i128 xkey0 = rx_set_int_vec_i128(AES_HASH_1R_XKEY0);
rx_vec_i128 xkey1 = rx_set_int_vec_i128(AES_HASH_1R_XKEY1);

hash_state0 = aesenc<softAes>(hash_state0, xkey0);
hash_state1 = aesdec<softAes>(hash_state1, xkey0);
hash_state2 = aesenc<softAes>(hash_state2, xkey0);
hash_state3 = aesdec<softAes>(hash_state3, xkey0);

hash_state0 = aesenc<softAes>(hash_state0, xkey1);
hash_state1 = aesdec<softAes>(hash_state1, xkey1);
hash_state2 = aesenc<softAes>(hash_state2, xkey1);
hash_state3 = aesdec<softAes>(hash_state3, xkey1);

//output hash
rx_store_vec_i128((rx_vec_i128*)hash + 0, hash_state0);
rx_store_vec_i128((rx_vec_i128*)hash + 1, hash_state1);
rx_store_vec_i128((rx_vec_i128*)hash + 2, hash_state2);
rx_store_vec_i128((rx_vec_i128*)hash + 3, hash_state3);
}

template void hashAndFillAes1Rx4<false>(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);
template void hashAndFillAes1Rx4<true>(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);
3 changes: 3 additions & 0 deletions src/aes_hash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,6 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer);

template<bool softAes>
void fillAes4Rx4(void *state, size_t outputSize, void *buffer);

template<bool softAes>
void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state);
7 changes: 7 additions & 0 deletions src/intrin_portable.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ typedef __m128d rx_vec_f128;
#define rx_aligned_alloc(a, b) _mm_malloc(a,b)
#define rx_aligned_free(a) _mm_free(a)
#define rx_prefetch_nta(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA)
#define rx_prefetch_t0(x) _mm_prefetch((const char *)(x), _MM_HINT_T0)

#define rx_load_vec_f128 _mm_load_pd
#define rx_store_vec_f128 _mm_store_pd
Expand Down Expand Up @@ -201,6 +202,7 @@ typedef union{
#define rx_aligned_alloc(a, b) malloc(a)
#define rx_aligned_free(a) free(a)
#define rx_prefetch_nta(x)
#define rx_prefetch_t0(x)

/* Splat 64-bit long long to 2 64-bit long longs */
FORCE_INLINE __m128i vec_splat2sd (int64_t scalar)
Expand Down Expand Up @@ -399,6 +401,10 @@ inline void rx_prefetch_nta(void* ptr) {
asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr));
}

inline void rx_prefetch_t0(const void* ptr) {
asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr));
}

FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
return vld1q_f64((const float64_t*)pd);
}
Expand Down Expand Up @@ -532,6 +538,7 @@ typedef union {
#define rx_aligned_alloc(a, b) malloc(a)
#define rx_aligned_free(a) free(a)
#define rx_prefetch_nta(x)
#define rx_prefetch_t0(x)

FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
rx_vec_f128 x;
Expand Down
17 changes: 17 additions & 0 deletions src/randomx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -363,4 +363,21 @@ extern "C" {
machine->getFinalResult(output, RANDOMX_HASH_SIZE);
}

void randomx_calculate_hash_first(randomx_vm* machine, uint64_t *tempHash, const void* input, size_t inputSize) {
blake2b(tempHash, sizeof(uint64_t) * 8, input, inputSize, nullptr, 0);
machine->initScratchpad(tempHash);
}

void randomx_calculate_hash_next(randomx_vm* machine, uint64_t *tempHash, const void* nextInput, size_t nextInputSize, void* output) {
machine->resetRoundingMode();
for (uint32_t chain = 0; chain < RANDOMX_PROGRAM_COUNT - 1; ++chain) {
machine->run(tempHash);
blake2b(tempHash, sizeof(uint64_t) * 8, machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0);
}
machine->run(tempHash);

// Finish current hash and fill the scratchpad for the next hash at the same time
blake2b(tempHash, sizeof(uint64_t) * 8, nextInput, nextInputSize, nullptr, 0);
machine->hashAndFill(output, RANDOMX_HASH_SIZE, tempHash);
}
}
16 changes: 16 additions & 0 deletions src/randomx.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define RANDOMX_H

#include <stddef.h>
#include <stdint.h>

#define RANDOMX_HASH_SIZE 32
#define RANDOMX_DATASET_ITEM_SIZE 64
Expand Down Expand Up @@ -238,6 +239,21 @@ RANDOMX_EXPORT void randomx_destroy_vm(randomx_vm *machine);
*/
RANDOMX_EXPORT void randomx_calculate_hash(randomx_vm *machine, const void *input, size_t inputSize, void *output);

/**
* Paired functions used to calculate multiple RandomX hashes during mining for example.
*
* @param machine is a pointer to a randomx_vm structure. Must not be NULL.
* @param tempHash an array of 8 64-bit values used to store intermediate data between calls to randomx_calculate_hash_first and randomx_calculate_hash_next.
* @param input is a pointer to memory to be hashed. Must not be NULL.
* @param inputSize is the number of bytes to be hashed.
* @param nextInput is a pointer to memory to be hashed for the next hash. Must not be NULL.
* @param nextInputSize is the number of bytes to be hashed for the next hash.
* @param output is a pointer to memory where the hash will be stored. Must not
* be NULL and at least RANDOMX_HASH_SIZE bytes must be available for writing.
*/
RANDOMX_EXPORT void randomx_calculate_hash_first(randomx_vm* machine, uint64_t *tempHash, const void* input, size_t inputSize);
RANDOMX_EXPORT void randomx_calculate_hash_next(randomx_vm* machine, uint64_t *tempHash, const void* nextInput, size_t nextInputSize, void* output);

#if defined(__cplusplus)
}
#endif
Expand Down
9 changes: 7 additions & 2 deletions src/tests/benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,11 +122,16 @@ void mine(randomx_vm* vm, std::atomic<uint32_t>& atomicNonce, AtomicHash& result
void* noncePtr = blockTemplate + 39;
auto nonce = atomicNonce.fetch_add(1);

uint64_t tempHash[8];

store32(noncePtr, nonce);
randomx_calculate_hash_first(vm, tempHash, blockTemplate, sizeof(blockTemplate));

while (nonce < noncesCount) {
nonce = atomicNonce.fetch_add(1);
store32(noncePtr, nonce);
randomx_calculate_hash(vm, blockTemplate, sizeof(blockTemplate), &hash);
randomx_calculate_hash_next(vm, tempHash, blockTemplate, sizeof(blockTemplate), &hash);
result.xorWith(hash);
nonce = atomicNonce.fetch_add(1);
}
}

Expand Down
6 changes: 6 additions & 0 deletions src/virtual_machine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,12 @@ namespace randomx {
blake2b(out, outSize, &reg, sizeof(RegisterFile), nullptr, 0);
}

template<class Allocator, bool softAes>
void VmBase<Allocator, softAes>::hashAndFill(void* out, size_t outSize, uint64_t *fill_state) {
hashAndFillAes1Rx4<softAes>((void*) getScratchpad(), ScratchpadSize, &reg.a, fill_state);
blake2b(out, outSize, &reg, sizeof(RegisterFile), nullptr, 0);
}

template<class Allocator, bool softAes>
void VmBase<Allocator, softAes>::initScratchpad(void* seed) {
fillAes1Rx4<softAes>(seed, ScratchpadSize, scratchpad);
Expand Down
2 changes: 2 additions & 0 deletions src/virtual_machine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class randomx_vm {
virtual ~randomx_vm() = 0;
virtual void allocate() = 0;
virtual void getFinalResult(void* out, size_t outSize) = 0;
virtual void hashAndFill(void* out, size_t outSize, uint64_t *fill_state) = 0;
virtual void setDataset(randomx_dataset* dataset) { }
virtual void setCache(randomx_cache* cache) { }
virtual void initScratchpad(void* seed) = 0;
Expand Down Expand Up @@ -78,6 +79,7 @@ namespace randomx {
void allocate() override;
void initScratchpad(void* seed) override;
void getFinalResult(void* out, size_t outSize) override;
void hashAndFill(void* out, size_t outSize, uint64_t *fill_state) override;
protected:
void generateProgram(void* seed);
};
Expand Down

0 comments on commit 219c02e

Please sign in to comment.