Skip to content

Commit

Permalink
various changes, cleanup for the release
Browse files Browse the repository at this point in the history
small fixes to better handle multiple threads per GPU

explicitly report that quark is not compatible with SM 2.1 (compact shuffle)
  • Loading branch information
tpruvot committed Nov 4, 2015
1 parent 1e3db41 commit e50556b
Show file tree
Hide file tree
Showing 11 changed files with 84 additions and 88 deletions.
18 changes: 5 additions & 13 deletions Algo256/cuda_fugue256.cu
Expand Up @@ -724,14 +724,13 @@ fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outp
__host__
void fugue256_cpu_init(int thr_id, uint32_t threads)
{
// Kopiere die Hash-Tabellen in den GPU-Speicher
// Link the hash tables in the GPU
texDef(0, mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
texDef(1, mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256);
texDef(2, mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256);
texDef(3, mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256);

// Speicher für alle Ergebnisse belegen
cudaMalloc(&d_fugue256_hashoutput[thr_id], (size_t) 32 * threads);
CUDA_SAFE_CALL(cudaMalloc(&d_fugue256_hashoutput[thr_id], (size_t) 32 * threads));
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
}

Expand All @@ -741,32 +740,25 @@ void fugue256_cpu_free(int thr_id)
cudaFree(d_fugue256_hashoutput[thr_id]);
cudaFree(d_resultNonce[thr_id]);

cudaUnbindTexture(mixTab0Tex);
cudaUnbindTexture(mixTab1Tex);
cudaUnbindTexture(mixTab2Tex);
cudaUnbindTexture(mixTab3Tex);

for (int i=0; i<4; i++)
cudaFree(d_textures[thr_id][i]);
}

__host__
void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
{
// CPU-Vorbereitungen treffen
sph_fugue256_context ctx_fugue_const;
sph_fugue256_init(&ctx_fugue_const);
sph_fugue256 (&ctx_fugue_const, data, 80); // State speichern

sph_fugue256 (&ctx_fugue_const, data, 80);
cudaMemcpyToSymbol(GPUstate, ctx_fugue_const.S, sizeof(uint32_t) * 30);

cudaMemcpyToSymbol(pTarget, pTargetIn, sizeof(uint32_t) * 8);
cudaMemcpyToSymbol(pTarget, pTargetIn, 32);

cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
}

__host__
void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce)
void fugue256_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
{
#if USE_SHARED
const uint32_t threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN
Expand Down
9 changes: 0 additions & 9 deletions Algo256/cuda_groestl256.cu
Expand Up @@ -283,15 +283,6 @@ void groestl256_cpu_init(int thr_id, uint32_t threads)
__host__
void groestl256_cpu_free(int thr_id)
{
cudaUnbindTexture(t0up2);
cudaUnbindTexture(t0dn2);
cudaUnbindTexture(t1up2);
cudaUnbindTexture(t1dn2);
cudaUnbindTexture(t2up2);
cudaUnbindTexture(t2dn2);
cudaUnbindTexture(t3up2);
cudaUnbindTexture(t3dn2);

for (int i=0; i<8; i++)
cudaFree(d_textures[thr_id][i]);

Expand Down
2 changes: 1 addition & 1 deletion README.txt
Expand Up @@ -228,7 +228,7 @@ features.

>>> RELEASE HISTORY <<<

Nov. 02nd 2015 v1.7
Nov. 05th 2015 v1.7
Improve old devices compatibility (x11, lyra2, qubit...)
Add windows support for SM 2.1 and drop SM 3.5 (x86)
Improve lyra2 (v1/v2) cuda implementations
Expand Down
17 changes: 13 additions & 4 deletions ccminer.cpp
Expand Up @@ -2793,12 +2793,13 @@ void parse_arg(int key, char *arg)
if (p) d *= 1e9;
opt_max_rate = d;
break;
case 'd': // CB
case 'd': // --device
{
int device_thr[MAX_GPUS] = { 0 };
int ngpus = cuda_num_devices();
char * pch = strtok (arg,",");
opt_n_threads = 0;
while (pch != NULL) {
while (pch != NULL && opt_n_threads < MAX_GPUS) {
if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0')
{
if (atoi(pch) < ngpus)
Expand All @@ -2818,6 +2819,14 @@ void parse_arg(int key, char *arg)
}
pch = strtok (NULL, ",");
}
// count threads per gpu
for (int n=0; n < opt_n_threads; n++) {
int device = device_map[n];
device_thr[device]++;
}
for (int n=0; n < ngpus; n++) {
gpu_threads = max(gpu_threads, device_thr[n]);
}
}
break;

Expand Down Expand Up @@ -3177,8 +3186,8 @@ int main(int argc, char *argv[])
else if (active_gpus > opt_n_threads)
active_gpus = opt_n_threads;

// generally doesn't work... let 1
gpu_threads = opt_n_threads / active_gpus;
// generally doesn't work well...
gpu_threads = max(gpu_threads, opt_n_threads / active_gpus);

if (opt_benchmark && opt_algo == ALGO_AUTO) {
bench_init(opt_n_threads);
Expand Down
14 changes: 9 additions & 5 deletions cuda_checkhash.cu
Expand Up @@ -11,23 +11,27 @@
__constant__ uint32_t pTarget[8]; // 32 bytes

// store MAX_GPUS device arrays of 8 nonces
static uint32_t* h_resNonces[MAX_GPUS];
static uint32_t* d_resNonces[MAX_GPUS];
static bool init_done = false;
static uint32_t* h_resNonces[MAX_GPUS] = { NULL };
static uint32_t* d_resNonces[MAX_GPUS] = { NULL };
static __thread bool init_done = false;

__host__
void cuda_check_cpu_init(int thr_id, uint32_t threads)
{
CUDA_CALL_OR_RET(cudaMallocHost(&h_resNonces[thr_id], 32));
CUDA_CALL_OR_RET(cudaMalloc(&d_resNonces[thr_id], 32));
CUDA_SAFE_CALL(cudaMallocHost(&h_resNonces[thr_id], 32));
init_done = true;
}

__host__
void cuda_check_cpu_free(int thr_id)
{
if (!init_done) return;
cudaFree(d_resNonces[thr_id]);
cudaFreeHost(h_resNonces[thr_id]);
d_resNonces[thr_id] = NULL;
h_resNonces[thr_id] = NULL;
init_done = false;
}

// Target Difficulty
Expand Down Expand Up @@ -198,7 +202,7 @@ uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounc
cuda_checkhash_64_suppl <<<grid, block>>> (startNounce, d_inputHash, d_resNonces[thr_id]);
cudaThreadSynchronize();

cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost);
cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 32, cudaMemcpyDeviceToHost);
rescnt = h_resNonces[thr_id][0];
if (rescnt > numNonce) {
if (numNonce <= rescnt) {
Expand Down
6 changes: 3 additions & 3 deletions cuda_fugue256.h
@@ -1,7 +1,7 @@
#ifndef _CUDA_FUGUE512_H
#define _CUDA_FUGUE512_H
#ifndef _CUDA_FUGUE256_H
#define _CUDA_FUGUE256_H

void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce);
void fugue256_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce);
void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn);
void fugue256_cpu_init(int thr_id, uint32_t threads);
void fugue256_cpu_free(int thr_id);
Expand Down
13 changes: 2 additions & 11 deletions fuguecoin.cpp
Expand Up @@ -8,14 +8,6 @@

#include "cuda_fugue256.h"

extern "C" void my_fugue256_init(void *cc);
extern "C" void my_fugue256(void *cc, const void *data, size_t len);
extern "C" void my_fugue256_close(void *cc, void *dst);
extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);

// vorbereitete Kontexte nach den ersten 80 Bytes
// sph_fugue256_context ctx_fugue_const[MAX_GPUS];

#define SWAP32(x) \
((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
(((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
Expand All @@ -38,11 +30,11 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne
uint32_t *ptarget = work->target;
uint32_t start_nonce = pdata[19]++;
int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 19;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 256*256*8
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
if (init[thr_id]) throughput = min(throughput, max_nonce - start_nonce);

if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0xf;
ptarget[7] = 0xf;

// init
if(!init[thr_id])
Expand All @@ -57,7 +49,6 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne
for (int kk=0; kk < 20; kk++)
be32enc(&endiandata[kk], pdata[kk]);

// Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt)
fugue256_cpu_setBlock(thr_id, endiandata, (void*)ptarget);

do {
Expand Down
3 changes: 2 additions & 1 deletion lyra2/lyra2REv2.cu
Expand Up @@ -114,11 +114,12 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));

if (device_sm[dev_id] < 300) {
applog(LOG_ERR, "Device SM 3.0 or more recent required!");
gpulog(LOG_ERR, thr_id, "Device SM 3.0 or more recent required!");
proper_exit(1);
return -1;
}

api_set_throughput(thr_id, throughput);
init[thr_id] = true;
}

Expand Down

0 comments on commit e50556b

Please sign in to comment.