Skip to content

Commit

Permalink
more memory reduction
Browse files Browse the repository at this point in the history
  • Loading branch information
tromp committed Jan 22, 2019
1 parent 7d69e76 commit 22f7a78
Showing 1 changed file with 163 additions and 74 deletions.
237 changes: 163 additions & 74 deletions src/cuckatoo/mean.cu
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,16 @@ const u32 EDGES_B = NZ * NEPS_B / NEPS;
const u32 ROW_EDGES_A = EDGES_A * NY;
const u32 ROW_EDGES_B = EDGES_B * NY;

// Number of Parts of BufferB, all but one of which will overlap BufferA
#ifndef NB
#define NB 2
// Number of rows in bufferB not overlapping bufferA
#ifndef NRB1
#define NRB1 (NX / 2)
#endif
#define NRB2 (NX - NRB1)
#define NB 2

// number of equal sized (each smaller than non-ovlp) parts in which to safely move bufferAB to bufferA,
#ifndef NA
#define NA ((NB * NEPS_A + NEPS_B-1) / NEPS_B)
#define NA ((NEPS_A * NX + EPS_B * NRB1 - 1) / (NEPS_B * NRB1))
#endif

__constant__ uint2 recoveredges[PROOFSIZE];
Expand Down Expand Up @@ -258,7 +261,7 @@ const u32 NONPART_BITS = ZBITS - PART_BITS;
const word_t NONPART_MASK = (1 << NONPART_BITS) - 1;
const int BITMAPBYTES = (NZ >> PART_BITS) / 8;

template<int NP, int maxIn, typename EdgeIn, int maxOut, typename EdgeOut>
template<int maxIn, typename EdgeIn, int maxOut, typename EdgeOut>
__global__ void Round(const int round, const int part, const siphash_keys &sipkeys, EdgeIn * __restrict__ src, EdgeOut * __restrict__ dst, u32 * __restrict__ srcIdx, u32 * __restrict__ dstIdx) {
const int group = blockIdx.x;
const int dim = blockDim.x;
Expand All @@ -270,43 +273,121 @@ __global__ void Round(const int round, const int part, const siphash_keys &sipke
for (int i = lid; i < BITMAPWORDS; i += dim)
ebitmap[i] = 0;
__syncthreads();
for (int i = 0; i < NP; i++, src += NX2 * maxIn, srcIdx += NX2) {
const int edgesInBucket = min(srcIdx[group], maxIn);
// if (!group && !lid) printf("round %d size %d\n", round, edgesInBucket);
const int loops = (edgesInBucket + dim-1) / dim;

for (int loop = 0; loop < loops; loop++) {
const int lindex = loop * dim + lid;
if (lindex < edgesInBucket) {
const int index = maxIn * group + lindex;
EdgeIn edge = __ldg(&src[index]);
if (null(edge)) continue;
u32 z = endpoint(sipkeys, edge, round&1) & ZMASK;
if ((z >> NONPART_BITS) == part) {
bitmapset(ebitmap, z & NONPART_MASK);
}
const int edgesInBucket = min(srcIdx[group], maxIn);
// if (!group && !lid) printf("round %d size %d\n", round, edgesInBucket);
const int loops = (edgesInBucket + dim-1) / dim;

for (int loop = 0; loop < loops; loop++) {
const int lindex = loop * dim + lid;
if (lindex < edgesInBucket) {
const int index = maxIn * group + lindex;
EdgeIn edge = __ldg(&src[index]);
if (null(edge)) continue;
u32 z = endpoint(sipkeys, edge, round&1) & ZMASK;
if ((z >> NONPART_BITS) == part) {
bitmapset(ebitmap, z & NONPART_MASK);
}
}
}
__syncthreads();
src -= NP * NX2 * maxIn; srcIdx -= NP * NX2;
for (int i = 0; i < NP; i++, src += NX2 * maxIn, srcIdx += NX2) {
const int edgesInBucket = min(srcIdx[group], maxIn);
const int loops = (edgesInBucket + dim-1) / dim;
for (int loop = 0; loop < loops; loop++) {
const int lindex = loop * dim + lid;
if (lindex < edgesInBucket) {
const int index = maxIn * group + lindex;
EdgeIn edge = __ldg(&src[index]);
if (null(edge)) continue;
u32 node0 = endpoint(sipkeys, edge, round&1);
u32 z = node0 & ZMASK;
if ((z >> NONPART_BITS) == part && bitmaptest(ebitmap, (z & NONPART_MASK) ^ 1)) {
u32 node1 = endpoint(sipkeys, edge, (round&1)^1);
const int bucket = node1 >> ZBITS;
const int bktIdx = min(atomicAdd(dstIdx + bucket, 1), maxOut - 1);
dst[bucket * maxOut + bktIdx] = (round&1) ? make_Edge(edge, *dst, node1, node0) : make_Edge(edge, *dst, node0, node1);
}
const int edgesInBucket = min(srcIdx[group], maxIn);
const int loops = (edgesInBucket + dim-1) / dim;
for (int loop = 0; loop < loops; loop++) {
const int lindex = loop * dim + lid;
if (lindex < edgesInBucket) {
const int index = maxIn * group + lindex;
EdgeIn edge = __ldg(&src[index]);
if (null(edge)) continue;
u32 node0 = endpoint(sipkeys, edge, round&1);
u32 z = node0 & ZMASK;
if ((z >> NONPART_BITS) == part && bitmaptest(ebitmap, (z & NONPART_MASK) ^ 1)) {
u32 node1 = endpoint(sipkeys, edge, (round&1)^1);
const int bucket = node1 >> ZBITS;
const int bktIdx = min(atomicAdd(dstIdx + bucket, 1), maxOut - 1);
dst[bucket * maxOut + bktIdx] = (round&1) ? make_Edge(edge, *dst, node1, node0) : make_Edge(edge, *dst, node0, node1);
}
}
}
}

template<int maxIn0, int maxIn1, typename EdgeIn, int maxOut, typename EdgeOut>
__global__ void Round2(const int round, const int part, const siphash_keys &sipkeys, EdgeIn * __restrict__ src, EdgeOut * __restrict__ dst, u32 * __restrict__ srcIdx, u32 * __restrict__ dstIdx) {
const int group = blockIdx.x;
const int dim = blockDim.x;
const int lid = threadIdx.x;
const int BITMAPWORDS = BITMAPBYTES / sizeof(u32);

extern __shared__ u32 ebitmap[];

for (int i = lid; i < BITMAPWORDS; i += dim)
ebitmap[i] = 0;
__syncthreads();

const int edgesInBucket = min(srcIdx[group], maxIn0);
// if (!group && !lid) printf("round %d size %d\n", round, edgesInBucket);
const int loops = (edgesInBucket + dim-1) / dim;
for (int loop = 0; loop < loops; loop++) {
const int lindex = loop * dim + lid;
if (lindex < edgesInBucket) {
const int index = maxIn0 * group + lindex;
EdgeIn edge = __ldg(&src[index]);
if (null(edge)) continue;
u32 z = endpoint(sipkeys, edge, round&1) & ZMASK;
if ((z >> NONPART_BITS) == part) {
bitmapset(ebitmap, z & NONPART_MASK);
}
}
}
const int edgesInBucket = min(srcIdx[NX2 + group], maxIn1);
// if (!group && !lid) printf("round %d size %d\n", round, edgesInBucket);
const int loops = (edgesInBucket + dim-1) / dim;
for (int loop = 0; loop < loops; loop++) {
const int lindex = loop * dim + lid;
if (lindex < edgesInBucket) {
const int index = maxIn1 * group + lindex;
EdgeIn edge = __ldg(&src[NX2*maxIn0 + index]);
if (null(edge)) continue;
u32 z = endpoint(sipkeys, edge, round&1) & ZMASK;
if ((z >> NONPART_BITS) == part) {
bitmapset(ebitmap, z & NONPART_MASK);
}
}
}
__syncthreads();

const int edgesInBucket = min(srcIdx[group], maxIn0);
const int loops = (edgesInBucket + dim-1) / dim;
for (int loop = 0; loop < loops; loop++) {
const int lindex = loop * dim + lid;
if (lindex < edgesInBucket) {
const int index = maxIn0 * group + lindex;
EdgeIn edge = __ldg(&src[index]);
if (null(edge)) continue;
u32 node0 = endpoint(sipkeys, edge, round&1);
u32 z = node0 & ZMASK;
if ((z >> NONPART_BITS) == part && bitmaptest(ebitmap, (z & NONPART_MASK) ^ 1)) {
u32 node1 = endpoint(sipkeys, edge, (round&1)^1);
const int bucket = node1 >> ZBITS;
const int bktIdx = min(atomicAdd(dstIdx + bucket, 1), maxOut - 1);
dst[bucket * maxOut + bktIdx] = (round&1) ? make_Edge(edge, *dst, node1, node0) : make_Edge(edge, *dst, node0, node1);
}
}
}
const int edgesInBucket = min(srcIdx[NX2 + group], maxIn1);
const int loops = (edgesInBucket + dim-1) / dim;
for (int loop = 0; loop < loops; loop++) {
const int lindex = loop * dim + lid;
if (lindex < edgesInBucket) {
const int index = maxIn1 * group + lindex;
EdgeIn edge = __ldg(&src[NX2*maxIn0 + index]);
if (null(edge)) continue;
u32 node0 = endpoint(sipkeys, edge, round&1);
u32 z = node0 & ZMASK;
if ((z >> NONPART_BITS) == part && bitmaptest(ebitmap, (z & NONPART_MASK) ^ 1)) {
u32 node1 = endpoint(sipkeys, edge, (round&1)^1);
const int bucket = node1 >> ZBITS;
const int bktIdx = min(atomicAdd(dstIdx + bucket, 1), maxOut - 1);
dst[bucket * maxOut + bktIdx] = (round&1) ? make_Edge(edge, *dst, node1, node0) : make_Edge(edge, *dst, node0, node1);
}
}
}
Expand Down Expand Up @@ -422,25 +503,27 @@ struct edgetrimmer {
for (int i = 0; i < 1+NB; i++) {
checkCudaErrors_V(cudaMalloc((void**)&indexesE[i], indexesSize));
}
sizeA = ROW_EDGES_A * NX * (tp.expand > 0 ? sizeof(u32) : sizeof(uint2));
sizeB = ROW_EDGES_B * NX * (tp.expand > 1 ? sizeof(u32) : sizeof(uint2));
const size_t bufferSize = sizeA + sizeB / NB;
assert(bufferSize >= sizeB + sizeB / NB / (tp.expand == 1 ? 1 : 2)); // ensure enough space for Round 1
sizeA = ROW_EDGES_A * NX * (tp.expand ? sizeof(u32) : sizeof(uint2));
sizeB = ROW_EDGES_B * NX * (tp.expand ? sizeof(u32) : sizeof(uint2));
const size_t nonoverlap = sizeB * NRB1 / NX;
const size_t bufferSize = sizeA + nonoverlap;
assert(bufferSize - sizeB >= sizeB / 2); // ensure enough space for Round 1, / 2 is for 0.296 / 0.632
checkCudaErrors_V(cudaMalloc((void**)&bufferA, bufferSize));
bufferAB = bufferA + sizeB / NB;
bufferAB = bufferA + nonoverlap;
bufferB = bufferA + bufferSize - sizeB;
assert(bufferA + sizeA == bufferB + sizeB * (NB-1) / NB); // ensure alignment of overlap
assert(bufferA + sizeA * NRB2 / NX <= bufferB); // ensure disjoint source dest in 2nd phase of round 0
assert(bufferA + sizeA == bufferB + sizeB * NRB2 / NX); // ensure alignment of overlap
cudaMemcpy(dt, this, sizeof(edgetrimmer), cudaMemcpyHostToDevice);
initsuccess = true;
int maxbytes = 0x10000; // 64 KB
cudaFuncSetAttribute(Round<1, EDGES_A, uint2, EDGES_B/NB, uint2>, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
cudaFuncSetAttribute(Round<1, EDGES_A, u32, EDGES_B/NB, uint2>, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
cudaFuncSetAttribute(Round<1, EDGES_A, u32, EDGES_B/NB, u32>, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
cudaFuncSetAttribute(Round<NB, EDGES_B/NB, uint2, EDGES_B/2, uint2>, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
cudaFuncSetAttribute(Round<NB, EDGES_B/NB, u32, EDGES_B/2, uint2>, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
cudaFuncSetAttribute(Round<1, EDGES_B/2, uint2, EDGES_A/4, uint2>, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
cudaFuncSetAttribute(Round<1, EDGES_A/4, uint2, EDGES_B/4, uint2>, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
cudaFuncSetAttribute(Round<1, EDGES_B/4, uint2, EDGES_B/4, uint2>, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
cudaFuncSetAttribute(Round<EDGES_A, uint2, EDGES_B/NB, uint2>, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
cudaFuncSetAttribute(Round<EDGES_A, u32, EDGES_B/NB, uint2>, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
cudaFuncSetAttribute(Round<EDGES_A, u32, EDGES_B/NB, u32>, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
cudaFuncSetAttribute(Round2<EDGES_B*NRB2/NX, EDGES_B*NRB1/NX, uint2, EDGES_B/2, uint2>, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
cudaFuncSetAttribute(Round2<EDGES_B*NRB2/NX, EDGES_B*NRB1/NX, u32, EDGES_B/2, uint2>, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
cudaFuncSetAttribute(Round<EDGES_B/2, uint2, EDGES_A/4, uint2>, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
cudaFuncSetAttribute(Round<EDGES_A/4, uint2, EDGES_B/4, uint2>, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
cudaFuncSetAttribute(Round<EDGES_B/4, uint2, EDGES_B/4, uint2>, cudaFuncAttributeMaxDynamicSharedMemorySize, maxbytes);
}
u64 globalbytes() const {
return (sizeA+sizeB/NB) + (1+NB) * indexesSize + sizeof(siphash_keys) + PROOFSIZE * 2*sizeof(u32) + sizeof(edgetrimmer);
Expand Down Expand Up @@ -497,46 +580,52 @@ struct edgetrimmer {
print_log("Seeding completed in %.0f + %.0f ms\n", durationA, durationB);
if (abort) return false;

for (u32 i = 0; i < NB; i++) cudaMemset(indexesE[1+i], 0, indexesSize);
cudaMemset(indexesE[2], 0, indexesSize);

qA = sizeA/NB;
const size_t qB = sizeB/NB;
qE = NX2 / NB;
for (u32 i = NB; i--; ) {
for (u32 part = 0; part <= PART_MASK; part++) {
if (tp.expand == 0) {
Round<1, EDGES_A, uint2, EDGES_B/NB, uint2><<<tp.trim.blocks/NB, tp.trim.tpb, BITMAPBYTES>>>(0, part, *dipkeys, (uint2*)(bufferA+i*qA), (uint2*)(bufferB+i*qB), indexesE[0]+i*qE, indexesE[1+i]); // to .632
} else if (tp.expand == 1) {
Round<1, EDGES_A, u32, EDGES_B/NB, uint2><<<tp.trim.blocks/NB, tp.trim.tpb, BITMAPBYTES>>>(0, part, *dipkeys, (u32*)(bufferA+i*qA), (uint2*)(bufferB+i*qB), indexesE[0]+i*qE, indexesE[1+i]); // to .632
} else { // tp.expand == 2
Round<1, EDGES_A, u32, EDGES_B/NB, u32><<<tp.trim.blocks/NB, tp.trim.tpb, BITMAPBYTES>>>(0, part, *dipkeys, (u32*)(bufferA+i*qA), (u32*)(bufferB+i*qB), indexesE[0]+i*qE, indexesE[1+i]); // to .632
}
if (abort) return false;
qA = sizeA * NRB2 / NX;
qE = NX * NRB2;
for (u32 part = 0; part <= PART_MASK; part++) {
if (tp.expand == 0) {
Round<EDGES_A, uint2, EDGES_B*NRB1/NX, uint2><<<tp.trim.blocks*NRB1/NX, tp.trim.tpb, BITMAPBYTES>>>(0, part, *dipkeys, (uint2*)(bufferA+qA), (uint2*)(bufferA+sizeA), indexesE[0]+qE, indexesE[2]); // to .632
} else { // tp.expand == 2
Round<EDGES_A, u32, EDGES_B*NRB1/NX, u32><<<tp.trim.blocks*NRB1/NX, tp.trim.tpb, BITMAPBYTES>>>(0, part, *dipkeys, (u32*)(bufferA+qA), (u32*)(bufferA+sizeA), indexesE[0]+qE, indexesE[2]); // to .632
}
if (abort) return false;
}

cudaMemset(indexesE[1], 0, indexesSize);

for (u32 part = 0; part <= PART_MASK; part++) {
if (tp.expand == 0) {
Round<EDGES_A, uint2, EDGES_B*NRB2/NX, uint2><<<tp.trim.blocks*NRB2/NX, tp.trim.tpb, BITMAPBYTES>>>(0, part, *dipkeys, (uint2*)bufferA, (uint2*)bufferB, indexesE[0], indexesE[1]); // to .632
} else { // tp.expand == 2
Round<EDGES_A, u32, EDGES_B*NRB2/NX, u32><<<tp.trim.blocks*NRB2/NX, tp.trim.tpb, BITMAPBYTES>>>(0, part, *dipkeys, (u32*)bufferA, (u32*)bufferB, indexesE[0], indexesE[1]); // to .632
}
if (abort) return false;
}

cudaMemset(indexesE[0], 0, indexesSize);

for (u32 part = 0; part <= PART_MASK; part++) {
if (tp.expand < 2) {
Round<NB, EDGES_B/NB, uint2, EDGES_B/2, uint2><<<tp.trim.blocks, tp.trim.tpb, BITMAPBYTES>>>(1, part, *dipkeys, (uint2*)bufferB, (uint2*)bufferA, indexesE[1], indexesE[0]); // to .296
Round2<EDGES_B*NRB2/NX, EDGES_B*NRB1/NX, uint2, EDGES_B/2, uint2><<<tp.trim.blocks, tp.trim.tpb, BITMAPBYTES>>>(1, part, *dipkeys, (uint2*)bufferB, (uint2*)bufferA, indexesE[1], indexesE[0]); // to .296
} else {
Round<NB, EDGES_B/NB, u32, EDGES_B/2, uint2><<<tp.trim.blocks, tp.trim.tpb, BITMAPBYTES>>>(1, part, *dipkeys, ( u32*)bufferB, (uint2*)bufferA, indexesE[1], indexesE[0]); // to .296
Round2<EDGES_B*NRB2/NX, EDGES_B*NRB1/NX, u32, EDGES_B/2, uint2><<<tp.trim.blocks, tp.trim.tpb, BITMAPBYTES>>>(1, part, *dipkeys, ( u32*)bufferB, (uint2*)bufferA, indexesE[1], indexesE[0]); // to .296
}
if (abort) return false;
}

cudaMemset(indexesE[1], 0, indexesSize);

for (u32 part = 0; part <= PART_MASK; part++) {
Round<1, EDGES_B/2, uint2, EDGES_A/4, uint2><<<tp.trim.blocks, tp.trim.tpb, BITMAPBYTES>>>(2, part, *dipkeys, (uint2 *)bufferA, (uint2 *)bufferB, indexesE[0], indexesE[1]); // to .176
Round<EDGES_B/2, uint2, EDGES_A/4, uint2><<<tp.trim.blocks, tp.trim.tpb, BITMAPBYTES>>>(2, part, *dipkeys, (uint2 *)bufferA, (uint2 *)bufferB, indexesE[0], indexesE[1]); // to .176
if (abort) return false;
}

cudaMemset(indexesE[0], 0, indexesSize);

for (u32 part = 0; part <= PART_MASK; part++) {
Round<1, EDGES_A/4, uint2, EDGES_B/4, uint2><<<tp.trim.blocks, tp.trim.tpb, BITMAPBYTES>>>(3, part, *dipkeys, (uint2 *)bufferB, (uint2 *)bufferA, indexesE[1], indexesE[0]); // to .117
Round<EDGES_A/4, uint2, EDGES_B/4, uint2><<<tp.trim.blocks, tp.trim.tpb, BITMAPBYTES>>>(3, part, *dipkeys, (uint2 *)bufferB, (uint2 *)bufferA, indexesE[1], indexesE[0]); // to .117
if (abort) return false;
}

Expand All @@ -545,12 +634,12 @@ struct edgetrimmer {
for (int round = 4; round < tp.ntrims; round += 2) {
cudaMemset(indexesE[1], 0, indexesSize);
for (u32 part = 0; part <= PART_MASK; part++) {
Round<1, EDGES_B/4, uint2, EDGES_B/4, uint2><<<tp.trim.blocks, tp.trim.tpb, BITMAPBYTES>>>(round , part, *dipkeys, (uint2 *)bufferA, (uint2 *)bufferB, indexesE[0], indexesE[1]);
Round<EDGES_B/4, uint2, EDGES_B/4, uint2><<<tp.trim.blocks, tp.trim.tpb, BITMAPBYTES>>>(round , part, *dipkeys, (uint2 *)bufferA, (uint2 *)bufferB, indexesE[0], indexesE[1]);
if (abort) return false;
}
cudaMemset(indexesE[0], 0, indexesSize);
for (u32 part = 0; part <= PART_MASK; part++) {
Round<1, EDGES_B/4, uint2, EDGES_B/4, uint2><<<tp.trim.blocks, tp.trim.tpb, BITMAPBYTES>>>(round+1, part, *dipkeys, (uint2 *)bufferB, (uint2 *)bufferA, indexesE[1], indexesE[0]);
Round<EDGES_B/4, uint2, EDGES_B/4, uint2><<<tp.trim.blocks, tp.trim.tpb, BITMAPBYTES>>>(round+1, part, *dipkeys, (uint2 *)bufferB, (uint2 *)bufferA, indexesE[1], indexesE[0]);
if (abort) return false;
}
}
Expand Down Expand Up @@ -799,7 +888,7 @@ int main(int argc, char **argv) {
while ((c = getopt(argc, argv, "scb:d:E:h:k:m:n:r:U:u:v:w:y:Z:z:")) != -1) {
switch (c) {
case 's':
print_log("SYNOPSIS\n cuda%d [-s] [-c] [-d device] [-E 0-2] [-h hexheader] [-m trims] [-n nonce] [-r range] [-U seedAblocks] [-u seedAthreads] [-v seedBthreads] [-w Trimthreads] [-y Tailthreads] [-Z recoverblocks] [-z recoverthreads]\n", NODEBITS);
print_log("SYNOPSIS\n cuda%d [-s] [-c] [-d device] [-E 0/2] [-h hexheader] [-m trims] [-n nonce] [-r range] [-U seedAblocks] [-u seedAthreads] [-v seedBthreads] [-w Trimthreads] [-y Tailthreads] [-Z recoverblocks] [-z recoverthreads]\n", NODEBITS);
print_log("DEFAULTS\n cuda%d -d %d -E %d -h \"\" -m %d -n %d -r %d -U %d -u %d -v %d -w %d -y %d -Z %d -z %d\n", NODEBITS, device, tp.expand, tp.ntrims, nonce, range, tp.genA.blocks, tp.genA.tpb, tp.genB.tpb, tp.trim.tpb, tp.tail.tpb, tp.recover.blocks, tp.recover.tpb);
exit(0);
case 'c':
Expand All @@ -810,7 +899,7 @@ int main(int argc, char **argv) {
break;
case 'E':
params.expand = atoi(optarg);
assert(params.expand <= 2);
assert(params.expand = 0 || params.expand = 2);
break;
case 'h':
len = strlen(optarg)/2;
Expand Down

0 comments on commit 22f7a78

Please sign in to comment.