Make the `faster` build targets obsolete by speeding up the low-memory versions

tromp · Oct 16, 2016 · f06ff12 · f06ff12
1 parent c2a84f9
commit f06ff12
Show file tree

Hide file tree

Showing 6 changed files with 65 additions and 112 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,11 @@
+equi
+equi1
+equi1g
+faster
+faster1
+equi965
+equi1445
+eqcuda
+eqcuda1445
+feqcuda
+verify
diff --git a/LICENSE.txt b/LICENSE.txt
@@ -3,9 +3,7 @@ The MIT License (MIT)
 Copyright (c) 2016 John Tromp
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software, EXCEPT FOR blake2b.cu WHICH ORIGINATES FROM
-https://github.com/tpruvot/ccminer/blob/windows/sia/sia.cu,
-and associated documentation files (the "Software"), to deal
+of this software, and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is

diff --git a/Makefile b/Makefile
@@ -2,23 +2,17 @@ OPT   = -O3
 FLAGS = -Wall -Wno-deprecated-declarations -D_POSIX_C_SOURCE=200112L $(OPT) -pthread 
 GPP   = g++ -march=native -m64 -std=c++11 $(FLAGS)
 
-all:	equi equi1 faster faster1 verify test spark
+all:	equi equi1 verify test spark
 
 equi:	equi.h equi_miner.h equi_miner.cpp Makefile
 	$(GPP) -DATOMIC equi_miner.cpp blake/blake2b.cpp -o equi
 
 equi1:	equi.h equi_miner.h equi_miner.cpp Makefile
-	$(GPP) -DSPARK equi_miner.cpp blake/blake2b.cpp -o equi1
+	$(GPP) equi_miner.cpp blake/blake2b.cpp -o equi1
 
 equi1g:	equi.h equi_miner.h equi_miner.cpp Makefile
 	g++ -g -DSPARK equi_miner.cpp blake/blake2b.cpp -pthread -o equi1g
 
-faster:	equi.h equi_miner.h equi_miner.cpp Makefile
-	$(GPP) -DJOINHT -DATOMIC equi_miner.cpp blake/blake2b.cpp -o faster
-
-faster1:	equi.h equi_miner.h equi_miner.cpp Makefile
-	$(GPP) -DJOINHT equi_miner.cpp blake/blake2b.cpp -o faster1
-
 equi965:	equi.h equi_miner.h equi_miner.cpp Makefile
 	$(GPP) -DWN=96 -DWK=5 equi_miner.cpp blake/blake2b.cpp -o equi965
 
@@ -37,14 +31,14 @@ feqcuda:	equi_miner.cu equi.h blake2b.cu Makefile
 verify:	equi.h equi.c Makefile
 	g++ -g equi.c blake/blake2b.cpp -o verify
 
-bench:	equi
-	time for i in {0..9}; do ./faster -n $$i; done
+bench:	equi1
+	time ./equi1 -n 1000 -r 100
 
 test:	equi verify Makefile
 	time ./equi -h "" -n 0 -t 1 -s | grep ^Sol | ./verify -h "" -n 0
 
-spark:	equi1
-	time ./equi1
+spark:	equi1g
+	time ./equi1g
 
 clean:	
-	rm equi equi1 equi1g faster faster1 equi965 equi1445 eqcuda eqcuda1445 feqcuda verify
+	rm equi equi1 equi1g equi965 equi1445 eqcuda eqcuda1445 feqcuda verify
diff --git a/blake2b.cu b/blake2b.cu
@@ -1,5 +1,7 @@
 // Blake2-B CUDA Implementation
 // tpruvot@github July 2016
+// permission granted to use under MIT license
+// modified for use in Zcash by John Tromp September 2016
 
 /**
  * uint2 direct ops by c++ operator definitions

diff --git a/equi_miner.cpp b/equi_miner.cpp
@@ -43,10 +43,11 @@ int main(int argc, char **argv) {
   printf("Looking for wagner-tree on (\"%s\",%d", header, nonce);
   if (range > 1)
     printf("-%d", nonce+range-1);
-  printf(") with %d %d-bits digits and %d threads\n", NDIGITS, DIGITBITS, nthreads);
+  printf(") with %d %d-bit digits and %d threads\n", NDIGITS, DIGITBITS, nthreads);
   thread_ctx *threads = (thread_ctx *)calloc(nthreads, sizeof(thread_ctx));
   assert(threads);
   equi eq(nthreads);
+  printf("Using %dMB of memory\n", eq.hta.alloced >> 20);
   u32 sumnsols = 0;
   for (int r = 0; r < range; r++) {
     eq.setnonce(header, nonce+r);

diff --git a/equi_miner.h b/equi_miner.h
@@ -109,87 +109,52 @@ u32 htunits(u32 bytes) {
   return (bytes + sizeof(htunit) - 1) / sizeof(htunit);
 }
 
-#ifdef JOINHT
-u32 slotsize(const u32 r) {
-  return 1 + htunits(hashsize(r));
-}
-// size (in htunits) of bucket in round 0 <= r < WK
-u32 bucketsize(const u32 r) {
-  return NSLOTS * slotsize(r);
-}
-#else
-u32 slotsize(const u32 r) {
-  return 1;
-}
-#endif
-
 // manages hash and tree data
 struct htalloc {
-// Defining JOINHT joins each tree with its corresponding hash,
-// so they may share a cache line. This gives a small speed
-// advantage but comes at the cost of a big memory increase
-// as hash-space can no longer be reclaimed
-#ifdef JOINHT
   htunit *trees[WK];
-#else
-  bucket *trees[WK];
-  htunit *hashes[WK];
-#endif
-  u64 alloced;
+  u32 alloced;
   htalloc() {
     alloced = 0;
   }
   void alloctrees() {
-#ifdef JOINHT
-    for (int r=0; r<WK; r++)
-      trees[r] = (htunit *)alloc(NBUCKETS * NSLOTS * (1 + htunits(hashsize(r))), sizeof(htunit));
-#else
 // optimize xenoncat's fixed memory layout, avoiding any waste
-// digit trees         hashes        trees
-// 0         0 A A A A A A . . . . . .
-// 1         0 A A A A A A B B B B B 1
-// 2         0 2 C C C C C B B B B B 1
-// 3         0 2 C C C C C D D D D 3 1
-// 4         0 2 4 E E E E D D D D 3 1
-// 5         0 2 4 E E E E F F F 5 3 1
-// 6         0 2 4 6 . G G F F F 5 3 1
-// 7         0 2 4 6 . G G H H 7 5 3 1
-// 8         0 2 4 6 8 . I H H 7 5 3 1
+// digit  trees  hashes  trees hashes
+// 0      0 A A A A A A   . . . . . .
+// 1      0 A A A A A A   1 B B B B B
+// 2      0 2 C C C C C   1 B B B B B
+// 3      0 2 C C C C C   1 3 D D D D
+// 4      0 2 4 E E E E   1 3 D D D D
+// 5      0 2 4 E E E E   1 3 5 F F F
+// 6      0 2 4 6 . G G   1 3 5 F F F
+// 7      0 2 4 6 . G G   1 3 5 7 H H
+// 8      0 2 4 6 8 . I   1 3 5 7 H H
     assert(DIGITBITS >= 16); // ensures hashes shorten by 1 unit every 2 digits
-    u32 units0 = htunits(hashsize(0)), units1 = htunits(hashsize(1));
-    digit *heap = (digit *)alloc(1+units0+units1+1, sizeof(digit));
-    for (int r=0; r<WK; r++) {
-      trees[r]  = (bucket *)(heap + (r&1 ? 1+units0+units1-r/2 :   r/2));
-      hashes[r] = (htunit *)(heap + (r&1 ? 1+units0            : 1+r/2));
-    }
-#endif
+    digit *heap[2];
+    for (u32 i =0; i < 2; i++)
+      heap[i] = (digit *)alloc(1 + htunits(hashsize(i)), sizeof(digit));
+    for (int r=0; r<WK; r++)
+      trees[r]  = (htunit *)heap[r&1] + r/2;
   }
   void dealloctrees() {
-#ifdef JOINHT
-    for (int r=0; r<WK; r++)
-      dealloc(trees[r], NBUCKETS * NSLOTS * (1 + htunits(hashsize(r))), sizeof(htunit));
-#else
-    u32 units0 = htunits(hashsize(0)), units1 = htunits(hashsize(1));
-    dealloc(trees[0], 1+units0+units1+1, sizeof(digit));
-#endif
+    for (u32 i =0; i < 2; i++)
+      free(trees[i]);
+  }
+  u32 slotsize(const u32 r) const {
+    return 1 + htunits(hashsize(r&1));
+  }
+  // size (in htunits) of bucket in round 0 <= r < WK
+  u32 bucketsize(const u32 r) const {
+    return NSLOTS * slotsize(r);
   }
   htunit *getbucket(u32 r, u32 bid) const {
-#ifdef JOINHT
     return &trees[r][bid * bucketsize(r)];
-#else
-    return trees[r][bid];
-#endif
   }
   void *alloc(const u32 n, const u32 sz) {
     void *mem  = calloc(n, sz);
     assert(mem);
-    alloced += (u64)n * sz;
+    alloced += n * sz;
     return mem;
   }
-  void dealloc(void *mem, const u32 n, const u32 sz) {
-    free(mem);
-    alloced -= (u64)n * sz;
-  }
 };
 
 typedef au32 bsizes[NBUCKETS];
@@ -249,8 +214,8 @@ struct equi {
     const htunit *bt = hta.getbucket(--r,t.bucketid);
     const u32 size = 1 << r;
     u32 *indices1 = indices + size;
-    listindices(r, bt[t.slotid0 * slotsize(r)].attr, indices);
-    listindices(r, bt[t.slotid1 * slotsize(r)].attr, indices1);
+    listindices(r, bt[t.slotid0 * hta.slotsize(r)].attr, indices);
+    listindices(r, bt[t.slotid1 * hta.slotsize(r)].attr, indices1);
     if (*indices > *indices1) {
       for (u32 i=0; i < size; i++) {
         const u32 tmp = indices[i];
@@ -292,73 +257,55 @@ struct equi {
       printf("\342\226%c", '\201'+bsizes[i]/SPARKSCALE);
 #endif
     }
-    printf(" %ld MB\n", hta.alloced >> 20);
+    printf("\n");
 #endif
   }
 
   struct htlayout {
     htalloc hta;
     u32 prevhtunits;
     u32 nexthtunits;
+    u32 prevslotunits;
+    u32 nextslotunits;
     u32 dunits;
     u32 prevbo;
     u32 nextbo;
     htunit *buck;
     htunit *hashbase;
 
-    htlayout(equi *eq, u32 r): hta(eq->hta), prevhtunits(0), dunits(0) {
+    htlayout(equi *eq, u32 r): hta(eq->hta), prevhtunits(0), prevslotunits(0), dunits(0) {
       u32 nexthashbytes = hashsize(r);
       nexthtunits = htunits(nexthashbytes);
+      nextslotunits = 1 + htunits(hashsize(r&1));
       prevbo = 0;
       nextbo = nexthtunits * sizeof(htunit) - nexthashbytes; // 0-3
       if (r) {
         u32 prevhashbytes = hashsize(r-1);
         prevhtunits = htunits(prevhashbytes);
+        prevslotunits = 1 + htunits(hashsize((r-1)&1));
         prevbo = prevhtunits * sizeof(htunit) - prevhashbytes; // 0-3
         dunits = prevhtunits - nexthtunits;
       }
-#ifdef JOINHT
-      nexthtunits++;
-      prevhtunits++;
-#endif
     }
     void setbucket(u32 r, u32 bid) {
       buck = hta.getbucket(r, bid);
-#ifdef JOINHT
       hashbase = buck + 1;
-#else
-      hashbase = hta.hashes[r] + (bid * NSLOTS) * prevhtunits;
-#endif
     }
     u32 getxhash(const u32 slot, const htunit *hash) const {
 #ifdef XWITHASH
       return hash->bytes[prevbo] & 0xf;
-#elif defined JOINHT
-      return buck[slot * prevhtunits].attr.xhash;
 #else
-      return buck[slot].attr.xhash;
-#endif
-    }
-    u32 prevhashunits() const {
-#ifdef JOINHT
-      return prevhtunits - 1;
-#else
-      return prevhtunits;
+      return buck[slot * prevslotunits].attr.xhash;
 #endif
     }
     bool equal(const htunit *hash0, const htunit *hash1) const {
-      return hash0[prevhashunits()-1].hash == hash1[prevhashunits()-1].hash;
+      return hash0[prevhtunits-1].hash == hash1[prevhtunits-1].hash;
     }
     htunit *addtree(u32 r, tree t, u32 bid, u32 slot) {
       htunit *buck = hta.getbucket(r,bid);
-#ifdef JOINHT
-      htunit *slotree = buck + slot * nexthtunits;
+      htunit *slotree = buck + slot * nextslotunits;
       slotree->attr = t;
       return slotree + 1;
-#else
-      buck[slot].attr = t;
-      return hta.hashes[r] + (bid * NSLOTS + slot) * nexthtunits;
-#endif
     }
   };
 
@@ -467,14 +414,14 @@ struct equi {
       htl.setbucket(r-1, bucketid);
       u32 bsize = getnslots(r-1, bucketid);
       for (u32 s1 = 0; s1 < bsize; s1++) {
-        const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits;
+        const htunit *hash1 = htl.hashbase + s1 * htl.prevslotunits;
         if (!cd.addslot(s1, htl.getxhash(s1, hash1))) {
           xfull++;
           continue;
         }
         for (; cd.nextcollision(); ) {
           const u32 s0 = cd.slot();
-          const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits;
+          const htunit *hash0 = htl.hashbase + s0 * htl.prevslotunits;
           if (htl.equal(hash0, hash1)) {
             hfull++;
             continue;
@@ -511,7 +458,7 @@ struct equi {
           xort.xhash = xhash;
 #endif
           htunit *xorhash = htl.addtree(r, xort, xorbucketid, xorslot);
-          for (u32 i=htl.dunits; i < htl.prevhashunits(); i++)
+          for (u32 i=htl.dunits; i < htl.prevhtunits; i++)
             xorhash[i-htl.dunits].hash = hash0[i].hash ^ hash1[i].hash;
         }
       }
@@ -526,12 +473,12 @@ struct equi {
       htl.setbucket(WK-1, bucketid);
       u32 bsize = getnslots(WK-1, bucketid);
       for (u32 s1 = 0; s1 < bsize; s1++) {
-        const htunit *hash1 = htl.hashbase + s1 * htl.prevhtunits;
+        const htunit *hash1 = htl.hashbase + s1 * htl.prevslotunits;
         if (!cd.addslot(s1, htl.getxhash(s1, hash1)))
           continue;
         for (; cd.nextcollision(); ) {
           const u32 s0 = cd.slot();
-          const htunit *hash0 = htl.hashbase + s0 * htl.prevhtunits;
+          const htunit *hash0 = htl.hashbase + s0 * htl.prevslotunits;
           if (htl.equal(hash0, hash1)) {
             tree xort; xort.bucketid = bucketid;
             xort.slotid0 = s0; xort.slotid1 = s1;