Skip to content

Commit

Permalink
fix sync bug
Browse files Browse the repository at this point in the history
  • Loading branch information
tromp committed Dec 5, 2018
1 parent 291515a commit 609dde2
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 35 deletions.
5 changes: 1 addition & 4 deletions src/cuckaroo/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,7 @@ mean30x1: cuckaroo.hpp bitmap.hpp graph.hpp ../threads/barrier.hpp ../crypto/si
$(GPP) -o $@ -DNSIPHASH=1 -DEXPANDROUND=10 -DCOMPRESSROUND=22 -DEDGEBITS=30 mean.cpp $(BLAKE_2B_SRC)

cuda19: ../crypto/siphash.cuh compress.hpp graph.hpp mean.cu Makefile
$(NVCC) -o $@ -DSYNCBUG -DPROOFSIZE=2 -DEPS_A=4 -DEPS_B=3 -DIDXSHIFT=2 -DEDGEBITS=19 -arch sm_35 mean.cu $(BLAKE_2B_SRC)

cudabug: cuda19
./cuda19 -U 64 -Z 64 -z 64 -m 4
$(NVCC) -o $@ -DEPS_A=4 -DEPS_B=3 -DIDXSHIFT=2 -DEDGEBITS=19 -arch sm_35 mean.cu $(BLAKE_2B_SRC)

cuda29: ../crypto/siphash.cuh compress.hpp graph.hpp mean.cu Makefile
$(NVCC) -o $@ -DEDGEBITS=29 -arch sm_35 mean.cu $(BLAKE_2B_SRC)
11 changes: 6 additions & 5 deletions src/cuckaroo/mean.cu
Original file line number Diff line number Diff line change
Expand Up @@ -186,11 +186,12 @@ __global__ void SeedB(const uint2 * __restrict__ source, ulonglong4 * __restrict
if (edgeIndex < bucketEdges) {
const int index = group * maxOut + edgeIndex;
uint2 edge = __ldg(&source[index]);
if (null(edge)) continue;
u32 node1 = edge.x;
col = (node1 >> ZBITS) & XMASK;
counter = min((int)atomicAdd(counters + col, 1), (int)(FLUSHB2-1)); // assuming COLS_LIMIT_LOSSES checked
tmp[col][counter] = edge;
if (!null(edge)) {
u32 node1 = edge.x;
col = (node1 >> ZBITS) & XMASK;
counter = min((int)atomicAdd(counters + col, 1), (int)(FLUSHB2-1)); // assuming COLS_LIMIT_LOSSES checked
tmp[col][counter] = edge;
}
}
__syncthreads();
if (counter == FLUSHB-1) {
Expand Down
31 changes: 18 additions & 13 deletions src/cuckatoo/mean.cu
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,10 @@ __global__ void SeedA(const siphash_keys &sipkeys, ulonglong4 * __restrict__ buf
int localIdx = min(FLUSHA2, counters[row]);
int newCount = localIdx % FLUSHA;
int nflush = localIdx - newCount;
int cnt = min((int)atomicAdd(indexes + row * NX + col, nflush), (int)(maxOut - nflush));
u32 grp = row * NX + col;
int cnt = min((int)atomicAdd(indexes + grp, nflush), (int)(maxOut - nflush));
for (int i = 0; i < nflush; i += TMPPERLL4) {
buffer[((u64)(row * NX + col) * maxOut + cnt + i) / TMPPERLL4] = *(ulonglong4 *)(&tmp[row][i]);
buffer[((u64)grp * maxOut + cnt + i) / TMPPERLL4] = *(ulonglong4 *)(&tmp[row][i]);
}
for (int t = 0; t < newCount; t++) {
tmp[row][t] = tmp[row][t + nflush];
Expand All @@ -120,11 +121,12 @@ __global__ void SeedA(const siphash_keys &sipkeys, ulonglong4 * __restrict__ buf
EdgeOut zero = make_Edge(0, tmp[0][0], 0, 0);
for (int row = lid; row < NX; row += dim) {
int localIdx = min(FLUSHA2, counters[row]);
u32 grp = row * NX + col;
for (int j = localIdx; j % TMPPERLL4; j++)
tmp[row][j] = zero;
for (int i = 0; i < localIdx; i += TMPPERLL4) {
int cnt = min((int)atomicAdd(indexes + row * NX + col, TMPPERLL4), (int)(maxOut - TMPPERLL4));
buffer[((u64)(row * NX + col) * maxOut + cnt) / TMPPERLL4] = *(ulonglong4 *)(&tmp[row][i]);
int cnt = min((int)atomicAdd(indexes + grp, TMPPERLL4), (int)(maxOut - TMPPERLL4));
buffer[((u64)grp * maxOut + cnt) / TMPPERLL4] = *(ulonglong4 *)(&tmp[row][i]);
}
}
}
Expand Down Expand Up @@ -167,20 +169,22 @@ __global__ void SeedB(const siphash_keys &sipkeys, const EdgeOut * __restrict__
if (edgeIndex < bucketEdges) {
const int index = group * maxOut + edgeIndex;
EdgeOut edge = __ldg(&source[index]);
if (null(edge)) continue;
u32 node0 = endpoint(sipkeys, edge, 0);
col = (node0 >> ZBITS) & XMASK;
counter = min((int)atomicAdd(counters + col, 1), (int)(FLUSHB2-1)); // assuming COLS_LIMIT_LOSSES checked
tmp[col][counter] = edge;
if (!null(edge)) {
u32 node0 = endpoint(sipkeys, edge, 0);
col = (node0 >> ZBITS) & XMASK;
counter = min((int)atomicAdd(counters + col, 1), (int)(FLUSHB2-1)); // assuming COLS_LIMIT_LOSSES checked
tmp[col][counter] = edge;
}
}
__syncthreads();
if (counter == FLUSHB-1) {
int localIdx = min(FLUSHB2, counters[col]);
int newCount = localIdx % FLUSHB;
int nflush = localIdx - newCount;
int cnt = min((int)atomicAdd(destinationIndexes + row * NX + col, nflush), (int)(maxOut - nflush));
u32 grp = row * NX + col;
int cnt = min((int)atomicAdd(destinationIndexes + grp, nflush), (int)(maxOut - nflush));
for (int i = 0; i < nflush; i += TMPPERLL4)
destination[((u64)(row * NX + col) * maxOut + cnt + i) / TMPPERLL4] = *(ulonglong4 *)(&tmp[col][i]);
destination[((u64)grp * maxOut + cnt + i) / TMPPERLL4] = *(ulonglong4 *)(&tmp[col][i]);
for (int t = 0; t < newCount; t++) {
tmp[col][t] = tmp[col][t + nflush];
}
Expand All @@ -191,11 +195,12 @@ __global__ void SeedB(const siphash_keys &sipkeys, const EdgeOut * __restrict__
EdgeOut zero = make_Edge(0, tmp[0][0], 0, 0);
for (int col = lid; col < NX; col += dim) {
int localIdx = min(FLUSHB2, counters[col]);
u32 grp = row * NX + col;
for (int j = localIdx; j % TMPPERLL4; j++)
tmp[col][j] = zero;
for (int i = 0; i < localIdx; i += TMPPERLL4) {
int cnt = min((int)atomicAdd(destinationIndexes + row * NX + col, TMPPERLL4), (int)(maxOut - TMPPERLL4));
destination[((u64)(row * NX + col) * maxOut + cnt) / TMPPERLL4] = *(ulonglong4 *)(&tmp[col][i]);
int cnt = min((int)atomicAdd(destinationIndexes + grp, TMPPERLL4), (int)(maxOut - TMPPERLL4));
destination[((u64)grp * maxOut + cnt) / TMPPERLL4] = *(ulonglong4 *)(&tmp[col][i]);
}
}
}
Expand Down
31 changes: 18 additions & 13 deletions src/cuckoo/mean.cu
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,10 @@ __global__ void SeedA(const siphash_keys &sipkeys, ulonglong4 * __restrict__ buf
int localIdx = min(FLUSHA2, counters[row]);
int newCount = localIdx % FLUSHA;
int nflush = localIdx - newCount;
int cnt = min((int)atomicAdd(indexes + row * NX + col, nflush), (int)(maxOut - nflush));
u32 grp = row * NX + col;
int cnt = min((int)atomicAdd(indexes + grp, nflush), (int)(maxOut - nflush));
for (int i = 0; i < nflush; i += TMPPERLL4)
buffer[((u64)(row * NX + col) * maxOut + cnt + i) / TMPPERLL4] = *(ulonglong4 *)(&tmp[row][i]);
buffer[((u64)grp * maxOut + cnt + i) / TMPPERLL4] = *(ulonglong4 *)(&tmp[row][i]);
for (int t = 0; t < newCount; t++) {
tmp[row][t] = tmp[row][t + nflush];
}
Expand All @@ -101,11 +102,12 @@ __global__ void SeedA(const siphash_keys &sipkeys, ulonglong4 * __restrict__ buf
EdgeOut zero = make_Edge(0, tmp[0][0], 0, 0);
for (int row = lid; row < NX; row += dim) {
int localIdx = min(FLUSHA2, counters[row]);
u32 grp = row * NX + col;
for (int j = localIdx; j % TMPPERLL4; j++)
tmp[row][j] = zero;
for (int i = 0; i < localIdx; i += TMPPERLL4) {
int cnt = min((int)atomicAdd(indexes + row * NX + col, TMPPERLL4), (int)(maxOut - TMPPERLL4));
buffer[((u64)(row * NX + col) * maxOut + cnt) / TMPPERLL4] = *(ulonglong4 *)(&tmp[row][i]);
int cnt = min((int)atomicAdd(indexes + grp, TMPPERLL4), (int)(maxOut - TMPPERLL4));
buffer[((u64)grp * maxOut + cnt) / TMPPERLL4] = *(ulonglong4 *)(&tmp[row][i]);
}
}
}
Expand Down Expand Up @@ -149,20 +151,22 @@ __global__ void SeedB(const siphash_keys &sipkeys, const EdgeOut * __restrict__
if (edgeIndex < bucketEdges) {
const int index = group * maxOut + edgeIndex;
EdgeOut edge = __ldg(&source[index]);
if (null(edge)) continue;
u32 node1 = endpoint(sipkeys, edge, 0);
col = (node1 >> ZBITS) & XMASK;
counter = min((int)atomicAdd(counters + col, 1), (int)(FLUSHB2-1));
tmp[col][counter] = edge;
if (!null(edge)) {
u32 node1 = endpoint(sipkeys, edge, 0);
col = (node1 >> ZBITS) & XMASK;
counter = min((int)atomicAdd(counters + col, 1), (int)(FLUSHB2-1));
tmp[col][counter] = edge;
}
}
__syncthreads();
if (counter == FLUSHB-1) {
int localIdx = min(FLUSHB2, counters[col]);
int newCount = localIdx % FLUSHB;
int nflush = localIdx - newCount;
int cnt = min((int)atomicAdd(destinationIndexes + row * NX + col, nflush), (int)(maxOut - nflush));
u32 grp = row * NX + col;
int cnt = min((int)atomicAdd(destinationIndexes + grp, nflush), (int)(maxOut - nflush));
for (int i = 0; i < nflush; i += TMPPERLL4)
destination[((u64)(row * NX + col) * maxOut + cnt + i) / TMPPERLL4] = *(ulonglong4 *)(&tmp[col][i]);
destination[((u64)grp * maxOut + cnt + i) / TMPPERLL4] = *(ulonglong4 *)(&tmp[col][i]);
for (int t = 0; t < newCount; t++) {
tmp[col][t] = tmp[col][t + nflush];
}
Expand All @@ -173,11 +177,12 @@ __global__ void SeedB(const siphash_keys &sipkeys, const EdgeOut * __restrict__
EdgeOut zero = make_Edge(0, tmp[0][0], 0, 0);
for (int col = lid; col < NX; col += dim) {
int localIdx = min(FLUSHB2, counters[col]);
u32 grp = row * NX + col;
for (int j = localIdx; j % TMPPERLL4; j++)
tmp[col][j] = zero;
for (int i = 0; i < localIdx; i += TMPPERLL4) {
int cnt = min((int)atomicAdd(destinationIndexes + row * NX + col, TMPPERLL4), (int)(maxOut - TMPPERLL4));
destination[((u64)(row * NX + col) * maxOut + cnt) / TMPPERLL4] = *(ulonglong4 *)(&tmp[col][i]);
int cnt = min((int)atomicAdd(destinationIndexes + grp, TMPPERLL4), (int)(maxOut - TMPPERLL4));
destination[((u64)grp * maxOut + cnt) / TMPPERLL4] = *(ulonglong4 *)(&tmp[col][i]);
}
}
}
Expand Down

0 comments on commit 609dde2

Please sign in to comment.