Skip to content

Commit

Permalink
Obsolete the "faster" build targets by speeding up the low-memory versions
Browse files (browse the repository at this point in the history)
  • Loading branch information
tromp committed Oct 16, 2016
1 parent c2a84f9 commit f06ff12
Show file tree
Hide file tree
Showing 6 changed files with 65 additions and 112 deletions.
11 changes: 11 additions & 0 deletions .gitignore
@@ -0,0 +1,11 @@
equi
equi1
equi1g
faster
faster1
equi965
equi1445
eqcuda
eqcuda1445
feqcuda
verify
4 changes: 1 addition & 3 deletions LICENSE.txt
Expand Up @@ -3,9 +3,7 @@ The MIT License (MIT)
Copyright (c) 2016 John Tromp

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software, EXCEPT FOR blake2b.cu WHICH ORIGINATES FROM
https://github.com/tpruvot/ccminer/blob/windows/sia/sia.cu,
and associated documentation files (the "Software"), to deal
of this software, and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
Expand Down
20 changes: 7 additions & 13 deletions Makefile
Expand Up @@ -2,23 +2,17 @@ OPT = -O3
FLAGS = -Wall -Wno-deprecated-declarations -D_POSIX_C_SOURCE=200112L $(OPT) -pthread
GPP = g++ -march=native -m64 -std=c++11 $(FLAGS)

all: equi equi1 faster faster1 verify test spark
all: equi equi1 verify test spark

equi: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DATOMIC equi_miner.cpp blake/blake2b.cpp -o equi

equi1: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DSPARK equi_miner.cpp blake/blake2b.cpp -o equi1
$(GPP) equi_miner.cpp blake/blake2b.cpp -o equi1

equi1g: equi.h equi_miner.h equi_miner.cpp Makefile
g++ -g -DSPARK equi_miner.cpp blake/blake2b.cpp -pthread -o equi1g

faster: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DJOINHT -DATOMIC equi_miner.cpp blake/blake2b.cpp -o faster

faster1: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DJOINHT equi_miner.cpp blake/blake2b.cpp -o faster1

equi965: equi.h equi_miner.h equi_miner.cpp Makefile
$(GPP) -DWN=96 -DWK=5 equi_miner.cpp blake/blake2b.cpp -o equi965

Expand All @@ -37,14 +31,14 @@ feqcuda: equi_miner.cu equi.h blake2b.cu Makefile
verify: equi.h equi.c Makefile
g++ -g equi.c blake/blake2b.cpp -o verify

bench: equi
time for i in {0..9}; do ./faster -n $$i; done
bench: equi1
time ./equi1 -n 1000 -r 100

test: equi verify Makefile
time ./equi -h "" -n 0 -t 1 -s | grep ^Sol | ./verify -h "" -n 0

spark: equi1
time ./equi1
spark: equi1g
time ./equi1g

clean:
rm equi equi1 equi1g faster faster1 equi965 equi1445 eqcuda eqcuda1445 feqcuda verify
rm equi equi1 equi1g equi965 equi1445 eqcuda eqcuda1445 feqcuda verify
2 changes: 2 additions & 0 deletions blake2b.cu
@@ -1,5 +1,7 @@
// Blake2-B CUDA Implementation
// tpruvot@github July 2016
// permission granted to use under MIT license
// modified for use in Zcash by John Tromp September 2016

/**
* uint2 direct ops by c++ operator definitions
Expand Down
3 changes: 2 additions & 1 deletion equi_miner.cpp
Expand Up @@ -43,10 +43,11 @@ int main(int argc, char **argv) {
printf("Looking for wagner-tree on (\"%s\",%d", header, nonce);
if (range > 1)
printf("-%d", nonce+range-1);
printf(") with %d %d-bits digits and %d threads\n", NDIGITS, DIGITBITS, nthreads);
printf(") with %d %d-bit digits and %d threads\n", NDIGITS, DIGITBITS, nthreads);
thread_ctx *threads = (thread_ctx *)calloc(nthreads, sizeof(thread_ctx));
assert(threads);
equi eq(nthreads);
printf("Using %dMB of memory\n", eq.hta.alloced >> 20);
u32 sumnsols = 0;
for (int r = 0; r < range; r++) {
eq.setnonce(header, nonce+r);
Expand Down
137 changes: 42 additions & 95 deletions equi_miner.h
Expand Up @@ -109,87 +109,52 @@ u32 htunits(u32 bytes) {
return (bytes + sizeof(htunit) - 1) / sizeof(htunit);
}

#ifdef JOINHT
u32 slotsize(const u32 r) {
return 1 + htunits(hashsize(r));
}
// size (in htunits) of bucket in round 0 <= r < WK
u32 bucketsize(const u32 r) {
return NSLOTS * slotsize(r);
}
#else
u32 slotsize(const u32 r) {
return 1;
}
#endif

// manages hash and tree data
struct htalloc {
// Defining JOINHT joins each tree with its corresponding hash,
// so they may share a cache line. This gives a small speed
// advantage but comes at the cost of a big memory increase
// as hash-space can no longer be reclaimed
#ifdef JOINHT
htunit *trees[WK];
#else
bucket *trees[WK];
htunit *hashes[WK];
#endif
u64 alloced;
u32 alloced;
htalloc() {
alloced = 0;
}
void alloctrees() {
#ifdef JOINHT
for (int r=0; r<WK; r++)
trees[r] = (htunit *)alloc(NBUCKETS * NSLOTS * (1 + htunits(hashsize(r))), sizeof(htunit));
#else
// optimize xenoncat's fixed memory layout, avoiding any waste
// digit trees hashes trees
// 0 0 A A A A A A . . . . . .
// 1 0 A A A A A A B B B B B 1
// 2 0 2 C C C C C B B B B B 1
// 3 0 2 C C C C C D D D D 3 1
// 4 0 2 4 E E E E D D D D 3 1
// 5 0 2 4 E E E E F F F 5 3 1
// 6 0 2 4 6 . G G F F F 5 3 1
// 7 0 2 4 6 . G G H H 7 5 3 1
// 8 0 2 4 6 8 . I H H 7 5 3 1
// digit trees hashes trees hashes
// 0 0 A A A A A A . . . . . .
// 1 0 A A A A A A 1 B B B B B
// 2 0 2 C C C C C 1 B B B B B
// 3 0 2 C C C C C 1 3 D D D D
// 4 0 2 4 E E E E 1 3 D D D D
// 5 0 2 4 E E E E 1 3 5 F F F
// 6 0 2 4 6 . G G 1 3 5 F F F
// 7 0 2 4 6 . G G 1 3 5 7 H H
// 8 0 2 4 6 8 . I 1 3 5 7 H H
assert(DIGITBITS >= 16); // ensures hashes shorten by 1 unit every 2 digits
u32 units0 = htunits(hashsize(0)), units1 = htunits(hashsize(1));
digit *heap = (digit *)alloc(1+units0+units1+1, sizeof(digit));
for (int r=0; r<WK; r++) {
trees[r] = (bucket *)(heap + (r&1 ? 1+units0+units1-r/2 : r/2));
hashes[r] = (htunit *)(heap + (r&1 ? 1+units0 : 1+r/2));
}
#endif
digit *heap[2];
for (u32 i =0; i < 2; i++)
heap[i] = (digit *)alloc(1 + htunits(hashsize(i)), sizeof(digit));
for (int r=0; r<WK; r++)
trees[r] = (htunit *)heap[r&1] + r/2;
}
void dealloctrees() {
#ifdef JOINHT
for (int r=0; r<WK; r++)
dealloc(trees[r], NBUCKETS * NSLOTS * (1 + htunits(hashsize(r))), sizeof(htunit));
#else
u32 units0 = htunits(hashsize(0)), units1 = htunits(hashsize(1));
dealloc(trees[0], 1+units0+units1+1, sizeof(digit));
#endif
for (u32 i =0; i < 2; i++)
free(trees[i]);
}
u32 slotsize(const u32 r) const {
return 1 + htunits(hashsize(r&1));
}
// size (in htunits) of bucket in round 0 <= r < WK
u32 bucketsize(const u32 r) const {
return NSLOTS * slotsize(r);
}
htunit *getbucket(u32 r, u32 bid) const {
#ifdef JOINHT
return &trees[r][bid * bucketsize(r)];
#else
return trees[r][bid];
#endif
}
void *alloc(const u32 n, const u32 sz) {
void *mem = calloc(n, sz);
assert(mem);
alloced += (u64)n * sz;
alloced += n * sz;
return mem;
}
void dealloc(void *mem, const u32 n, const u32 sz) {
free(mem);
alloced -= (u64)n * sz;
}
};

typedef au32 bsizes[NBUCKETS];
Expand Down Expand Up @@ -249,8 +214,8 @@ struct equi {
const htunit *bt = hta.getbucket(--r,t.bucketid);
const u32 size = 1 << r;
u32 *indices1 = indices + size;
listindices(r, bt[t.slotid0 * slotsize(r)].attr, indices);
listindices(r, bt[t.slotid1 * slotsize(r)].attr, indices1);
listindices(r, bt[t.slotid0 * hta.slotsize(r)].attr, indices);
listindices(r, bt[t.slotid1 * hta.slotsize(r)].attr, indices1);
if (*indices > *indices1) {
for (u32 i=0; i < size; i++) {
const u32 tmp = indices[i];
Expand Down Expand Up @@ -292,73 +257,55 @@ struct equi {
printf("\342\226%c", '\201'+bsizes[i]/SPARKSCALE);
#endif
}
printf(" %ld MB\n", hta.alloced >> 20);
printf("\n");
#endif
}

struct htlayout {
htalloc hta;
u32 prevhtunits;
u32 nexthtunits;
u32 prevslotunits;
u32 nextslotunits;
u32 dunits;
u32 prevbo;
u32 nextbo;
htunit *buck;
htunit *hashbase;

htlayout(equi *eq, u32 r): hta(eq->hta), prevhtunits(0), dunits(0) {
htlayout(equi *eq, u32 r): hta(eq->hta), prevhtunits(0), prevslotunits(0), dunits(0) {
u32 nexthashbytes = hashsize(r);
nexthtunits = htunits(nexthashbytes);
nextslotunits = 1 + htunits(hashsize(r&1));
prevbo = 0;
nextbo = nexthtunits * sizeof(htunit) - nexthashbytes; // 0-3
if (r) {
u32 prevhashbytes = hashsize(r-1);
prevhtunits = htunits(prevhashbytes);
prevslotunits = 1 + htunits(hashsize((r-1)&1));
prevbo = prevhtunits * sizeof(htunit) - prevhashbytes; // 0-3
dunits = prevhtunits - nexthtunits;
}
#ifdef JOINHT
nexthtunits++;
prevhtunits++;
#endif
}
void setbucket(u32 r, u32 bid) {
buck = hta.getbucket(r, bid);
#ifdef JOINHT
hashbase = buck + 1;
#else
hashbase = hta.hashes[r] + (bid * NSLOTS) * prevhtunits;
#endif
}
u32 getxhash(const u32 slot, const htunit *hash) const {
#ifdef XWITHASH
return hash->bytes[prevbo] & 0xf;
#elif defined JOINHT
return buck[slot * prevhtunits].attr.xhash;
#else
return buck[slot].attr.xhash;
#endif
}
u32 prevhashunits() const {
#ifdef JOINHT
return prevhtunits - 1;
#else
return prevhtunits;
return buck[slot * prevslotunits].attr.xhash;
#endif
}
bool equal(const htunit *hash0, const htunit *hash1) const {
return hash0[prevhashunits()-1].hash == hash1[prevhashunits()-1].hash;
return hash0[prevhtunits-1].hash == hash1[prevhtunits-1].hash;
}
htunit *addtree(u32 r, tree t, u32 bid, u32 slot) {
htunit *buck = hta.getbucket(r,bid);
#ifdef JOINHT
htunit *slotree = buck + slot * nexthtunits;
htunit *slotree = buck + slot * nextslotunits;
slotree->attr = t;
return slotree + 1;
#else
buck[slot].attr = t;
return hta.hashes[r] + (bid * NSLOTS + slot) * nexthtunits;
#endif
}
};

Expand Down Expand Up @@ -467,14 +414,14 @@ struct equi {
htl.setbucket(r-1, bucketid);
u32 bsize = getnslots(r-1, bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits;
const htunit *hash1 = htl.hashbase + s1 * htl.prevslotunits;
if (!cd.addslot(s1, htl.getxhash(s1, hash1))) {
xfull++;
continue;
}
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits;
const htunit *hash0 = htl.hashbase + s0 * htl.prevslotunits;
if (htl.equal(hash0, hash1)) {
hfull++;
continue;
Expand Down Expand Up @@ -511,7 +458,7 @@ struct equi {
xort.xhash = xhash;
#endif
htunit *xorhash = htl.addtree(r, xort, xorbucketid, xorslot);
for (u32 i=htl.dunits; i < htl.prevhashunits(); i++)
for (u32 i=htl.dunits; i < htl.prevhtunits; i++)
xorhash[i-htl.dunits].hash = hash0[i].hash ^ hash1[i].hash;
}
}
Expand All @@ -526,12 +473,12 @@ struct equi {
htl.setbucket(WK-1, bucketid);
u32 bsize = getnslots(WK-1, bucketid);
for (u32 s1 = 0; s1 < bsize; s1++) {
const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits;
const htunit *hash1 = htl.hashbase + s1 * htl.prevslotunits;
if (!cd.addslot(s1, htl.getxhash(s1, hash1)))
continue;
for (; cd.nextcollision(); ) {
const u32 s0 = cd.slot();
const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits;
const htunit *hash0 = htl.hashbase + s0 * htl.prevslotunits;
if (htl.equal(hash0, hash1)) {
tree xort; xort.bucketid = bucketid;
xort.slotid0 = s0; xort.slotid1 = s1;
Expand Down

0 comments on commit f06ff12

Please sign in to comment.