Skip to content

Commit

Permalink
decred: multiple nonces code cleanup
Browse files Browse the repository at this point in the history
The double loop is not useful, and prefer the __thread attribute
to enhance the code readability (remove the 2D host arrays).
  • Loading branch information
tpruvot committed Sep 27, 2016
1 parent 9eead77 commit a766969
Showing 1 changed file with 58 additions and 49 deletions.
107 changes: 58 additions & 49 deletions Algo256/decred.cu
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
/**
* Blake-256 Decred 180-Bytes input Cuda Kernel (Tested on SM 5/5.2/6.1)
* Blake-256 Decred 180-Bytes input Cuda Kernel
*
* Tanguy Pruvot - Feb 2016
*
* Merged 8-round blake (XVC) tweaks
* Further improved by: ~2.72%
* Alexis Provos - Jun 2016
* Tanguy Pruvot, Alexis Provos - Feb/Sep 2016
*/

#include <stdint.h>
Expand All @@ -20,7 +16,7 @@ extern "C" {
#define TPB 640

/* max count of found nonces in one call (like sgminer) */
#define maxResults 4
#define MAX_RESULTS 4

/* hash by cpu with blake 256 */
extern "C" void decred_hash(void *output, const void *input)
Expand All @@ -47,7 +43,7 @@ __constant__ uint32_t _ALIGN(16) c_xors[215];

/* Buffers of candidate nonce(s) */
static uint32_t *d_resNonce[MAX_GPUS];
static uint32_t *h_resNonce[MAX_GPUS];
static __thread uint32_t *h_resNonce;

#define ROR8(a) __byte_perm(a, 0, 0x0321)
#define ROL16(a) __byte_perm(a, 0, 0x1032)
Expand Down Expand Up @@ -110,13 +106,13 @@ static uint32_t *h_resNonce[MAX_GPUS];

#define pxorx0GS2(a,b,c,d, a1,b1,c1,d1) { \
v[ a]+= (c_xors[i++]^nonce) + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \
v[ d] = ROL16(v[ d] ^ v[ a]); v[d1] = ROL16(v[d1] ^ v[a1]); \
v[ d] = ROL16(v[ d] ^ v[ a]); v[d1] = ROL16(v[d1] ^ v[a1]); \
v[ c]+= v[ d]; v[c1]+= v[d1]; \
v[ b] = ROTR32(v[ b] ^ v[ c], 12); v[b1] = ROTR32(v[b1] ^ v[c1], 12); \
v[ a]+= c_xors[i++] + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \
v[ d] = ROR8(v[ d] ^ v[ a]); v[d1] = ROR8(v[d1] ^ v[a1]); \
v[ a]+= c_xors[i++] + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \
v[ d] = ROR8(v[ d] ^ v[ a]); v[d1] = ROR8(v[d1] ^ v[a1]); \
v[ c]+= v[ d]; v[c1]+= v[d1]; \
v[ b] = ROTR32(v[ b] ^ v[ c], 7); v[b1] = ROTR32(v[b1] ^ v[c1], 7); \
v[ b] = ROTR32(v[ b] ^ v[ c], 7); v[b1] = ROTR32(v[b1] ^ v[c1], 7); \
}

__global__ __launch_bounds__(TPB,1)
Expand Down Expand Up @@ -367,7 +363,7 @@ extern "C" int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce
const dim3 grid((throughput + TPB-1)/(TPB));
const dim3 block(TPB);

if (!init[thr_id]){
if (!init[thr_id]) {
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
Expand All @@ -378,60 +374,73 @@ extern "C" int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);

CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], maxResults*sizeof(uint32_t)), -1);
CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce[thr_id], maxResults*sizeof(uint32_t)), -1);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], MAX_RESULTS*sizeof(uint32_t)), -1);
CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce, MAX_RESULTS*sizeof(uint32_t)), -1);
init[thr_id] = true;
}
memcpy(endiandata, pdata, 180);

decred_cpu_setBlock_52(endiandata);
h_resNonce[thr_id][0] = 1;
h_resNonce[0] = 1;

do {
if (h_resNonce[thr_id][0])
if (h_resNonce[0])
cudaMemset(d_resNonce[thr_id], 0x00, sizeof(uint32_t));

// GPU HASH
decred_gpu_hash_nonce <<<grid, block>>> (throughput, (*pnonce), d_resNonce[thr_id], targetHigh);
cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);

if (h_resNonce[thr_id][0])
// first cell contains the valid nonces count
cudaMemcpy(h_resNonce, d_resNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);

if (h_resNonce[0])
{
cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], (h_resNonce[thr_id][0]+1)*sizeof(uint32_t), cudaMemcpyDeviceToHost);
uint32_t _ALIGN(64) vhash[8];

cudaMemcpy(h_resNonce, d_resNonce[thr_id], (h_resNonce[0]+1)*sizeof(uint32_t), cudaMemcpyDeviceToHost);

for(uint32_t i=1; i <= h_resNonce[thr_id][0]; i++)
be32enc(&endiandata[DCR_NONCE_OFT32], h_resNonce[1]);
decred_hash(vhash, endiandata);
if (vhash[6] <= ptarget[6] && fulltest(vhash, ptarget))
{
uint32_t _ALIGN(64) vhash[8];
be32enc(&endiandata[DCR_NONCE_OFT32], h_resNonce[thr_id][i]);
decred_hash(vhash, endiandata);
if (vhash[6] <= ptarget[6] && fulltest(vhash, ptarget))
int rc = work->valid_nonces = 1;
work_set_target_ratio(work, vhash);
*hashes_done = (*pnonce) - first_nonce + throughput;
work->nonces[0] = swab32(h_resNonce[1]);
*pnonce = work->nonces[0];

// search for another nonce
for(uint32_t n=2; n <= h_resNonce[0]; n++)
{
int rc = 1;
work_set_target_ratio(work, vhash);
*hashes_done = (*pnonce) - first_nonce + throughput;
work->nonces[0] = swab32(h_resNonce[thr_id][i]);
// search for another nonce
for(uint32_t j=i+1; j <= h_resNonce[thr_id][0]; j++)
{
be32enc(&endiandata[DCR_NONCE_OFT32], h_resNonce[thr_id][j]);
decred_hash(vhash, endiandata);
if (vhash[6] <= ptarget[6] && fulltest(vhash, ptarget)){
work->nonces[1] = swab32(h_resNonce[thr_id][j]);
if(!opt_quiet)
gpulog(LOG_NOTICE, thr_id, "second nonce found %u / %08x - %u / %08x", i, work->nonces[0], j, work->nonces[1]);
if(bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) {
work_set_target_ratio(work, vhash);
xchg(work->nonces[1], work->nonces[0]);
}
rc = 2;
break;
be32enc(&endiandata[DCR_NONCE_OFT32], h_resNonce[n]);
decred_hash(vhash, endiandata);
if (vhash[6] <= ptarget[6] && fulltest(vhash, ptarget)) {
work->nonces[1] = swab32(h_resNonce[n]);

if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) {
// we really want the best first ? depends...
work->shareratio[1] = work->shareratio[0];
work->sharediff[1] = work->sharediff[0];
xchg(work->nonces[1], work->nonces[0]);
work_set_target_ratio(work, vhash);
work->valid_nonces++;
} else if (work->valid_nonces == 1) {
bn_set_target_ratio(work, vhash, 1);
work->valid_nonces++;
}
rc = 2; // MAX_NONCES submit limited to 2

gpulog(LOG_DEBUG, thr_id, "multiple nonces 1:%08x (%g) %u:%08x (%g)",
work->nonces[0], work->sharediff[0], n, work->nonces[1], work->sharediff[1]);

} else if (vhash[6] > ptarget[6]) {
gpulog(LOG_WARNING, thr_id, "result %u for %08x does not validate on CPU!", n, h_resNonce[n]);
}
*pnonce = work->nonces[0];
return rc;
} else {
gpulog(LOG_WARNING, thr_id, "result %u for %08x does not validate on CPU!", i, h_resNonce[thr_id][i]);
}
return rc;

} else if (vhash[6] > ptarget[6]) {
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", h_resNonce[1]);
}
}
*pnonce += throughput;
Expand All @@ -450,7 +459,7 @@ extern "C" void free_decred(int thr_id)
return;

cudaDeviceSynchronize();
cudaFreeHost(h_resNonce[thr_id]);
cudaFreeHost(h_resNonce);
cudaFree(d_resNonce[thr_id]);

init[thr_id] = false;
Expand Down

0 comments on commit a766969

Please sign in to comment.