Skip to content

Commit

Permalink
various changes, cleanup for the release
Browse files Browse the repository at this point in the history
small fixes to better handle multiple threads per GPU

explicitly report that quark is not compatible with SM 2.1 (compact shuffle)
  • Loading branch information
tpruvot committed Nov 4, 2015
1 parent 1e3db41 commit e50556b
Show file tree
Hide file tree
Showing 11 changed files with 84 additions and 88 deletions.
18 changes: 5 additions & 13 deletions Algo256/cuda_fugue256.cu
Expand Up @@ -724,14 +724,13 @@ fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outp
__host__
void fugue256_cpu_init(int thr_id, uint32_t threads)
{
// Kopiere die Hash-Tabellen in den GPU-Speicher
// Link the hash tables in the GPU
texDef(0, mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
texDef(1, mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256);
texDef(2, mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256);
texDef(3, mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256);

// Speicher für alle Ergebnisse belegen
cudaMalloc(&d_fugue256_hashoutput[thr_id], (size_t) 32 * threads);
CUDA_SAFE_CALL(cudaMalloc(&d_fugue256_hashoutput[thr_id], (size_t) 32 * threads));
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
}

Expand All @@ -741,32 +740,25 @@ void fugue256_cpu_free(int thr_id)
cudaFree(d_fugue256_hashoutput[thr_id]);
cudaFree(d_resultNonce[thr_id]);

cudaUnbindTexture(mixTab0Tex);
cudaUnbindTexture(mixTab1Tex);
cudaUnbindTexture(mixTab2Tex);
cudaUnbindTexture(mixTab3Tex);

for (int i=0; i<4; i++)
cudaFree(d_textures[thr_id][i]);
}

__host__
void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
{
// CPU-Vorbereitungen treffen
sph_fugue256_context ctx_fugue_const;
sph_fugue256_init(&ctx_fugue_const);
sph_fugue256 (&ctx_fugue_const, data, 80); // State speichern

sph_fugue256 (&ctx_fugue_const, data, 80);
cudaMemcpyToSymbol(GPUstate, ctx_fugue_const.S, sizeof(uint32_t) * 30);

cudaMemcpyToSymbol(pTarget, pTargetIn, sizeof(uint32_t) * 8);
cudaMemcpyToSymbol(pTarget, pTargetIn, 32);

cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
}

__host__
void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce)
void fugue256_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
{
#if USE_SHARED
const uint32_t threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN
Expand Down
9 changes: 0 additions & 9 deletions Algo256/cuda_groestl256.cu
Expand Up @@ -283,15 +283,6 @@ void groestl256_cpu_init(int thr_id, uint32_t threads)
__host__
void groestl256_cpu_free(int thr_id)
{
cudaUnbindTexture(t0up2);
cudaUnbindTexture(t0dn2);
cudaUnbindTexture(t1up2);
cudaUnbindTexture(t1dn2);
cudaUnbindTexture(t2up2);
cudaUnbindTexture(t2dn2);
cudaUnbindTexture(t3up2);
cudaUnbindTexture(t3dn2);

for (int i=0; i<8; i++)
cudaFree(d_textures[thr_id][i]);

Expand Down
2 changes: 1 addition & 1 deletion README.txt
Expand Up @@ -228,7 +228,7 @@ features.

>>> RELEASE HISTORY <<<

Nov. 02nd 2015 v1.7
Nov. 05th 2015 v1.7
Improve old devices compatibility (x11, lyra2, qubit...)
Add windows support for SM 2.1 and drop SM 3.5 (x86)
Improve lyra2 (v1/v2) cuda implementations
Expand Down
17 changes: 13 additions & 4 deletions ccminer.cpp
Expand Up @@ -2793,12 +2793,13 @@ void parse_arg(int key, char *arg)
if (p) d *= 1e9;
opt_max_rate = d;
break;
case 'd': // CB
case 'd': // --device
{
int device_thr[MAX_GPUS] = { 0 };
int ngpus = cuda_num_devices();
char * pch = strtok (arg,",");
opt_n_threads = 0;
while (pch != NULL) {
while (pch != NULL && opt_n_threads < MAX_GPUS) {
if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0')
{
if (atoi(pch) < ngpus)
Expand All @@ -2818,6 +2819,14 @@ void parse_arg(int key, char *arg)
}
pch = strtok (NULL, ",");
}
// count threads per gpu
for (int n=0; n < opt_n_threads; n++) {
int device = device_map[n];
device_thr[device]++;
}
for (int n=0; n < ngpus; n++) {
gpu_threads = max(gpu_threads, device_thr[n]);
}
}
break;

Expand Down Expand Up @@ -3177,8 +3186,8 @@ int main(int argc, char *argv[])
else if (active_gpus > opt_n_threads)
active_gpus = opt_n_threads;

// generally doesn't work... let 1
gpu_threads = opt_n_threads / active_gpus;
// generally doesn't work well...
gpu_threads = max(gpu_threads, opt_n_threads / active_gpus);

if (opt_benchmark && opt_algo == ALGO_AUTO) {
bench_init(opt_n_threads);
Expand Down
14 changes: 9 additions & 5 deletions cuda_checkhash.cu
Expand Up @@ -11,23 +11,27 @@
__constant__ uint32_t pTarget[8]; // 32 bytes

// store MAX_GPUS device arrays of 8 nonces
static uint32_t* h_resNonces[MAX_GPUS];
static uint32_t* d_resNonces[MAX_GPUS];
static bool init_done = false;
static uint32_t* h_resNonces[MAX_GPUS] = { NULL };
static uint32_t* d_resNonces[MAX_GPUS] = { NULL };
static __thread bool init_done = false;

__host__
void cuda_check_cpu_init(int thr_id, uint32_t threads)
{
CUDA_CALL_OR_RET(cudaMallocHost(&h_resNonces[thr_id], 32));
CUDA_CALL_OR_RET(cudaMalloc(&d_resNonces[thr_id], 32));
CUDA_SAFE_CALL(cudaMallocHost(&h_resNonces[thr_id], 32));
init_done = true;
}

__host__
void cuda_check_cpu_free(int thr_id)
{
if (!init_done) return;
cudaFree(d_resNonces[thr_id]);
cudaFreeHost(h_resNonces[thr_id]);
d_resNonces[thr_id] = NULL;
h_resNonces[thr_id] = NULL;
init_done = false;
}

// Target Difficulty
Expand Down Expand Up @@ -198,7 +202,7 @@ uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounc
cuda_checkhash_64_suppl <<<grid, block>>> (startNounce, d_inputHash, d_resNonces[thr_id]);
cudaThreadSynchronize();

cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost);
cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 32, cudaMemcpyDeviceToHost);
rescnt = h_resNonces[thr_id][0];
if (rescnt > numNonce) {
if (numNonce <= rescnt) {
Expand Down
6 changes: 3 additions & 3 deletions cuda_fugue256.h
@@ -1,7 +1,7 @@
#ifndef _CUDA_FUGUE512_H
#define _CUDA_FUGUE512_H
#ifndef _CUDA_FUGUE256_H
#define _CUDA_FUGUE256_H

void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce);
void fugue256_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce);
void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn);
void fugue256_cpu_init(int thr_id, uint32_t threads);
void fugue256_cpu_free(int thr_id);
Expand Down
13 changes: 2 additions & 11 deletions fuguecoin.cpp
Expand Up @@ -8,14 +8,6 @@

#include "cuda_fugue256.h"

extern "C" void my_fugue256_init(void *cc);
extern "C" void my_fugue256(void *cc, const void *data, size_t len);
extern "C" void my_fugue256_close(void *cc, void *dst);
extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);

// vorbereitete Kontexte nach den ersten 80 Bytes
// sph_fugue256_context ctx_fugue_const[MAX_GPUS];

#define SWAP32(x) \
((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
(((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
Expand All @@ -38,11 +30,11 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne
uint32_t *ptarget = work->target;
uint32_t start_nonce = pdata[19]++;
int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 19;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 256*256*8
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
if (init[thr_id]) throughput = min(throughput, max_nonce - start_nonce);

if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0xf;
ptarget[7] = 0xf;

// init
if(!init[thr_id])
Expand All @@ -57,7 +49,6 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne
for (int kk=0; kk < 20; kk++)
be32enc(&endiandata[kk], pdata[kk]);

// Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt)
fugue256_cpu_setBlock(thr_id, endiandata, (void*)ptarget);

do {
Expand Down
3 changes: 2 additions & 1 deletion lyra2/lyra2REv2.cu
Expand Up @@ -114,11 +114,12 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));

if (device_sm[dev_id] < 300) {
applog(LOG_ERR, "Device SM 3.0 or more recent required!");
gpulog(LOG_ERR, thr_id, "Device SM 3.0 or more recent required!");
proper_exit(1);
return -1;
}

api_set_throughput(thr_id, throughput);
init[thr_id] = true;
}

Expand Down

0 comments on commit e50556b

Please sign in to comment.