decred: return to a host 2D array to allow freeing it

tpruvot · Sep 27, 2016 · 92b7d72 · 92b7d72
1 parent 34e264c
commit 92b7d72
Showing 1 changed file with 17 additions and 16 deletions.
diff --git a/Algo256/decred.cu b/Algo256/decred.cu
@@ -43,7 +43,7 @@ __constant__ uint32_t _ALIGN(16) c_xors[215];
 
 /* Buffers of candidate nonce(s) */
 static uint32_t *d_resNonce[MAX_GPUS];
-static __thread uint32_t *h_resNonce;
+static uint32_t *h_resNonce[MAX_GPUS];
 
 #define ROR8(a)  __byte_perm(a, 0, 0x0321)
 #define ROL16(a) __byte_perm(a, 0, 0x1032)
@@ -375,47 +375,48 @@ extern "C" int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce
 		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], MAX_RESULTS*sizeof(uint32_t)), -1);
-		CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce, MAX_RESULTS*sizeof(uint32_t)), -1);
+		CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce[thr_id], MAX_RESULTS*sizeof(uint32_t)), -1);
 		init[thr_id] = true;
 	}
 	memcpy(endiandata, pdata, 180);
 
 	decred_cpu_setBlock_52(endiandata);
-	h_resNonce[0] = 1;
+	cudaMemset(d_resNonce[thr_id], 0x00, sizeof(uint32_t));
 
 	do {
-		if (h_resNonce[0])
-			cudaMemset(d_resNonce[thr_id], 0x00, sizeof(uint32_t));
+		uint32_t* resNonces = h_resNonce[thr_id];
+
+		if (resNonces[0]) cudaMemset(d_resNonce[thr_id], 0x00, sizeof(uint32_t));
 
 		// GPU HASH
 		decred_gpu_hash_nonce <<<grid, block>>> (throughput, (*pnonce), d_resNonce[thr_id], targetHigh);
 
 		// first cell contains the valid nonces count
-		cudaMemcpy(h_resNonce, d_resNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+		cudaMemcpy(resNonces, d_resNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 
-		if (h_resNonce[0])
+		if (resNonces[0])
 		{
 			uint32_t _ALIGN(64) vhash[8];
 
-			cudaMemcpy(h_resNonce, d_resNonce[thr_id], (h_resNonce[0]+1)*sizeof(uint32_t), cudaMemcpyDeviceToHost);
+			cudaMemcpy(resNonces, d_resNonce[thr_id], (resNonces[0]+1)*sizeof(uint32_t), cudaMemcpyDeviceToHost);
 
-			be32enc(&endiandata[DCR_NONCE_OFT32], h_resNonce[1]);
+			be32enc(&endiandata[DCR_NONCE_OFT32], resNonces[1]);
 			decred_hash(vhash, endiandata);
 			if (vhash[6] <= ptarget[6] && fulltest(vhash, ptarget))
 			{
 				int rc = work->valid_nonces = 1;
 				work_set_target_ratio(work, vhash);
 				*hashes_done = (*pnonce) - first_nonce + throughput;
-				work->nonces[0] = swab32(h_resNonce[1]);
+				work->nonces[0] = swab32(resNonces[1]);
 				*pnonce = work->nonces[0];
 
 				// search for another nonce
-				for(uint32_t n=2; n <= h_resNonce[0]; n++)
+				for(uint32_t n=2; n <= resNonces[0]; n++)
 				{
-					be32enc(&endiandata[DCR_NONCE_OFT32], h_resNonce[n]);
+					be32enc(&endiandata[DCR_NONCE_OFT32], resNonces[n]);
 					decred_hash(vhash, endiandata);
 					if (vhash[6] <= ptarget[6] && fulltest(vhash, ptarget)) {
-						work->nonces[1] = swab32(h_resNonce[n]);
+						work->nonces[1] = swab32(resNonces[n]);
 
 						if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) {
 							// we really want the best first ? depends...
@@ -434,13 +435,13 @@ extern "C" int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce
 							work->nonces[0], work->sharediff[0], n, work->nonces[1], work->sharediff[1]);
 
 					} else if (vhash[6] > ptarget[6]) {
-						gpulog(LOG_WARNING, thr_id, "result %u for %08x does not validate on CPU!", n, h_resNonce[n]);
+						gpulog(LOG_WARNING, thr_id, "result %u for %08x does not validate on CPU!", n, resNonces[n]);
 					}
 				}
 				return rc;
 
 			} else if (vhash[6] > ptarget[6]) {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", h_resNonce[1]);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", resNonces[1]);
 			}
 		}
 		*pnonce += throughput;
@@ -459,7 +460,7 @@ extern "C" void free_decred(int thr_id)
 		return;
 
 	cudaDeviceSynchronize();
-	cudaFreeHost(h_resNonce);
+	cudaFreeHost(h_resNonce[thr_id]);
 	cudaFree(d_resNonce[thr_id]);
 
 	init[thr_id] = false;