Skip to content

Commit

Permalink
MachSuite: update benchmarks to DMA interface v3.
Browse files Browse the repository at this point in the history
This requires allocating separate host/accelerator arrays, copying the
contents of bench_args_t into the host arrays, and finally copying the
result back into bench_args_t. We also use calloc_aligned() to zero out
the accelerator memory storing the output when possible, so we don't need
to dmaLoad a bunch of zeros. Finally, some arrays are no longer mapped to
the accelerator if they never participate in host-to-accelerator
communication.

Follow-up commits can switch the dmaLoad() calls over to the helper methods
in order to pipeline DMA transactions in page-sized chunks.

BUG=issue #15

TESTED=manually ran all benchmarks in native/tracer mode and gem5 and
verified output was correct.

Change-Id: Ieef084d77c8a5341b296b3eac66ec628a1b0236f
  • Loading branch information
xyzsam committed Mar 24, 2020
1 parent 9d3b262 commit e28b131
Show file tree
Hide file tree
Showing 57 changed files with 742 additions and 445 deletions.
11 changes: 7 additions & 4 deletions MachSuite/aes/aes/aes.c
Original file line number Diff line number Diff line change
Expand Up @@ -178,15 +178,18 @@ uint8_t aes_expandEncKey(uint8_t *k, uint8_t rc)
} /* aes_expandEncKey */

/* -------------------------------------------------------------------------- */
void aes256_encrypt_ecb(aes256_context *ctx, uint8_t k[32], uint8_t buf[16])
void aes256_encrypt_ecb(
aes256_context *host_ctx, uint8_t* host_k, uint8_t* host_buf,
aes256_context *ctx, uint8_t* k, uint8_t* buf)
{
//INIT
uint8_t rcon = 1;
uint8_t i;

#ifdef DMA_MODE
dmaLoad(&k[0], 0, 32 * sizeof(uint8_t));
dmaLoad(&buf[0], 0, 16 * sizeof(uint8_t));
dmaLoad(k, host_k, 32 * sizeof(uint8_t));
dmaLoad(buf, host_buf, 16 * sizeof(uint8_t));
dmaLoad(ctx, host_ctx, sizeof(aes256_context));
#endif

ecb1 : for (i = 0; i < sizeof(ctx->key); i++){
Expand Down Expand Up @@ -215,7 +218,7 @@ void aes256_encrypt_ecb(aes256_context *ctx, uint8_t k[32], uint8_t buf[16])
rcon = aes_expandEncKey(ctx->key, rcon);
aes_addRoundKey(buf, ctx->key);
#ifdef DMA_MODE
dmaStore(&buf[0], 0, 16 * sizeof(uint8_t));
dmaStore(host_buf, buf, 16 * sizeof(uint8_t));
#endif
} /* aes256_encrypt */

4 changes: 3 additions & 1 deletion MachSuite/aes/aes/aes.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ typedef struct {
uint8_t deckey[32];
} aes256_context;

void aes256_encrypt_ecb(aes256_context *ctx, uint8_t k[32], uint8_t buf[16]);
void aes256_encrypt_ecb(
aes256_context *host_ctx, uint8_t* host_k, uint8_t* host_buf,
aes256_context *ctx, uint8_t* k, uint8_t* buf);

////////////////////////////////////////////////////////////////////////////////
// Test harness interface code.
Expand Down
24 changes: 20 additions & 4 deletions MachSuite/aes/aes/local_support.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,33 @@ int INPUT_SIZE = sizeof(struct bench_args_t);

void run_benchmark( void *vargs ) {
struct bench_args_t *args = (struct bench_args_t *)vargs;
// Copy args into separate host arrays.
aes256_context* host_ctx = malloc_aligned_memcpy(&args->ctx, sizeof(args->ctx));
uint8_t* host_k = malloc_aligned_memcpy(&args->k, sizeof(args->k));
uint8_t* host_buf = malloc_aligned_memcpy(&args->buf, sizeof(args->buf));
// Allocate memory for the accelerator arrays.
aes256_context* accel_ctx = malloc_aligned(sizeof(args->ctx));
uint8_t* accel_k = malloc_aligned(sizeof(args->k));
uint8_t* accel_buf = malloc_aligned(sizeof(args->buf));
#ifdef GEM5_HARNESS
mapArrayToAccelerator(
MACHSUITE_AES_AES, "ctx", (void*)&args->ctx, sizeof(args->ctx));
MACHSUITE_AES_AES, "host_ctx", host_ctx, sizeof(args->ctx));
mapArrayToAccelerator(
MACHSUITE_AES_AES, "k", (void*)&args->k, sizeof(args->k));
MACHSUITE_AES_AES, "host_k", host_k, sizeof(args->k));
mapArrayToAccelerator(
MACHSUITE_AES_AES, "buf", (void*)&args->buf, sizeof(args->buf));
MACHSUITE_AES_AES, "host_buf", host_buf, sizeof(args->buf));
invokeAcceleratorAndBlock(MACHSUITE_AES_AES);
#else
aes256_encrypt_ecb( &(args->ctx), args->k, args->buf );
aes256_encrypt_ecb(host_ctx, host_k, host_buf,
accel_ctx, accel_k, accel_buf);
#endif
memcpy(&args->buf, host_buf, sizeof(args->buf));
free(host_ctx);
free(host_k);
free(host_buf);
free(accel_ctx);
free(accel_k);
free(accel_buf);
}

/* Input format:
Expand Down
30 changes: 15 additions & 15 deletions MachSuite/bfs/bulk/bfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,29 +10,29 @@ Hong, Oguntebi, Olukotun. "Efficient Parallel Graph Exploration on Multi-Core CP
#include "gem5/dma_interface.h"
#endif

void bfs(node_t nodes[N_NODES], edge_t edges[N_EDGES],
node_index_t starting_node, level_t level[N_NODES],
edge_index_t level_counts[N_LEVELS])
void bfs(node_t* host_nodes,
edge_t* host_edges,
level_t* host_level,
edge_index_t* host_level_counts,
node_t* nodes,
edge_t* edges,
level_t* level,
edge_index_t* level_counts,
node_index_t starting_node)
{
node_index_t n;
edge_index_t e;
level_t horizon;
edge_index_t cnt;

#ifdef DMA_MODE
dmaLoad(&level[0], 0, N_NODES * sizeof(level_t));
dmaLoad(&nodes[0], 0, N_NODES * sizeof(node_t));
dmaLoad(&edges[0], 0 * 512 * sizeof(edge_t), PAGE_SIZE);
dmaLoad(&edges[0], 1 * 512 * sizeof(edge_t), PAGE_SIZE);
dmaLoad(&edges[0], 2 * 512 * sizeof(edge_t), PAGE_SIZE);
dmaLoad(&edges[0], 3 * 512 * sizeof(edge_t), PAGE_SIZE);
dmaLoad(&edges[0], 4 * 512 * sizeof(edge_t), PAGE_SIZE);
dmaLoad(&edges[0], 5 * 512 * sizeof(edge_t), PAGE_SIZE);
dmaLoad(&edges[0], 6 * 512 * sizeof(edge_t), PAGE_SIZE);
dmaLoad(&edges[0], 7 * 512 * sizeof(edge_t), PAGE_SIZE);
dmaLoad(nodes, host_nodes, N_NODES * sizeof(node_t));
dmaLoad(edges, host_edges, N_EDGES * sizeof(edge_t));
dmaLoad(level, host_level, N_NODES * sizeof(level_t));
#endif

level[starting_node] = 0;
init_horizons: for( i=0; i<N_LEVELS; i++ )
level_counts[i] = 0;
level_counts[0] = 1;

loop_horizons: for( horizon=0; horizon<N_LEVELS; horizon++ ) {
Expand All @@ -57,6 +57,6 @@ void bfs(node_t nodes[N_NODES], edge_t edges[N_EDGES],
break;
}
#ifdef DMA_MODE
dmaStore(&level[0], 0, N_NODES * sizeof(level_t));
dmaStore(host_level_counts, level_counts, N_LEVELS * sizeof(edge_index_t));
#endif
}
11 changes: 9 additions & 2 deletions MachSuite/bfs/bulk/bfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,5 +51,12 @@ struct bench_args_t {
edge_index_t level_counts[N_LEVELS];
};

void bfs(node_t nodes[N_NODES], edge_t edges[N_EDGES], node_index_t starting_node, level_t level[N_NODES], edge_index_t level_counts[N_LEVELS]);

void bfs(node_t* host_nodes,
edge_t* host_edges,
level_t* host_level,
edge_index_t* host_level_counts,
node_t* nodes,
edge_t* edges,
level_t* level,
edge_index_t* level_counts,
node_index_t starting_node);
33 changes: 24 additions & 9 deletions MachSuite/bfs/bulk/local_support.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,38 @@ int INPUT_SIZE = sizeof(struct bench_args_t);

void run_benchmark( void *vargs ) {
struct bench_args_t *args = (struct bench_args_t *)vargs;
node_t* host_nodes = malloc_aligned_memcpy(&args->nodes, sizeof(args->nodes));
edge_t* host_edges = malloc_aligned_memcpy(&args->edges, sizeof(args->edges));
level_t* host_level = malloc_aligned_memcpy(&args->level, sizeof(args->level));
edge_index_t* host_level_counts = calloc_aligned(sizeof(args->level_counts));
node_t* accel_nodes = malloc_aligned(sizeof(args->nodes));
edge_t* accel_edges = malloc_aligned(sizeof(args->edges));
level_t* accel_level = malloc_aligned(sizeof(args->level));
edge_index_t* accel_level_counts = calloc_aligned(sizeof(args->level_counts));
#ifdef GEM5_HARNESS
mapArrayToAccelerator(
MACHSUITE_BFS_BULK, "nodes", (void*)&args->nodes, sizeof(args->nodes));
MACHSUITE_BFS_BULK, "host_nodes", host_nodes, sizeof(args->nodes));
mapArrayToAccelerator(
MACHSUITE_BFS_BULK, "edges", (void*)&args->edges, sizeof(args->edges));
MACHSUITE_BFS_BULK, "host_edges", host_edges, sizeof(args->edges));
mapArrayToAccelerator(
MACHSUITE_BFS_BULK, "starting_node", (void*)&args->starting_node,
sizeof(args->starting_node));
MACHSUITE_BFS_BULK, "host_level", host_level, sizeof(args->level));
mapArrayToAccelerator(
MACHSUITE_BFS_BULK, "level", (void*)&args->level, sizeof(args->level));
mapArrayToAccelerator(
MACHSUITE_BFS_BULK, "level_counts", (void*)&args->level_counts,
sizeof(args->level_counts));
MACHSUITE_BFS_BULK, "host_level_counts", host_level_counts, sizeof(args->level_counts));
invokeAcceleratorAndBlock(MACHSUITE_BFS_BULK);
#else
bfs(args->nodes, args->edges, args->starting_node, args->level, args->level_counts);
bfs(host_nodes, host_edges, host_level, host_level_counts,
accel_nodes, accel_edges, accel_level, accel_level_counts,
args->starting_node);
#endif
memcpy(&args->level_counts, host_level_counts, sizeof(args->level_counts));
free(host_nodes);
free(host_edges);
free(host_level);
free(host_level_counts);
free(accel_nodes);
free(accel_edges);
free(accel_level);
free(accel_level_counts);
}

/* Input format:
Expand Down
27 changes: 13 additions & 14 deletions MachSuite/bfs/queue/bfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,15 @@ Hong, Oguntebi, Olukotun. "Efficient Parallel Graph Exploration on Multi-Core CP
#define Q_POP() { q_out = (q_out+1)%N_NODES; }
#define Q_EMPTY() (q_in>q_out ? q_in==q_out+1 : (q_in==0)&&(q_out==N_NODES-1))

void bfs(node_t nodes[N_NODES], edge_t edges[N_EDGES],
node_index_t starting_node, level_t level[N_NODES],
edge_index_t level_counts[N_LEVELS])
void bfs(node_t* host_nodes,
edge_t* host_edges,
level_t* host_level,
edge_index_t* host_level_counts,
node_t* nodes,
edge_t* edges,
level_t* level,
edge_index_t* level_counts,
node_index_t starting_node)
{
node_index_t queue[N_NODES];
node_index_t q_in, q_out;
Expand All @@ -26,16 +32,9 @@ void bfs(node_t nodes[N_NODES], edge_t edges[N_EDGES],
unsigned i;

#ifdef DMA_MODE
dmaLoad(&level[0], 0, N_NODES * sizeof(level_t));
dmaLoad(&nodes[0], 0, N_NODES * sizeof(node_t));
dmaLoad(&edges[0], 0 * 512 * sizeof(edge_t), PAGE_SIZE);
dmaLoad(&edges[0], 1 * 512 * sizeof(edge_t), PAGE_SIZE);
dmaLoad(&edges[0], 2 * 512 * sizeof(edge_t), PAGE_SIZE);
dmaLoad(&edges[0], 3 * 512 * sizeof(edge_t), PAGE_SIZE);
dmaLoad(&edges[0], 4 * 512 * sizeof(edge_t), PAGE_SIZE);
dmaLoad(&edges[0], 5 * 512 * sizeof(edge_t), PAGE_SIZE);
dmaLoad(&edges[0], 6 * 512 * sizeof(edge_t), PAGE_SIZE);
dmaLoad(&edges[0], 7 * 512 * sizeof(edge_t), PAGE_SIZE);
dmaLoad(nodes, host_nodes, N_NODES * sizeof(node_t));
dmaLoad(edges, host_edges, N_EDGES * sizeof(edge_t));
dmaLoad(level, host_level, N_NODES * sizeof(level_t));
#endif

/*init_levels: for( n=0; n<N_NODES; n++ )*/
Expand Down Expand Up @@ -76,6 +75,6 @@ void bfs(node_t nodes[N_NODES], edge_t edges[N_EDGES],
printf("\n");
*/
#ifdef DMA_MODE
dmaStore(&level[0], 0, N_NODES * sizeof(level_t));
dmaStore(host_level_counts, level_counts, N_LEVELS * sizeof(edge_index_t));
#endif
}
10 changes: 9 additions & 1 deletion MachSuite/bfs/queue/bfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,12 @@ struct bench_args_t {
edge_index_t level_counts[N_LEVELS];
};

void bfs(node_t nodes[N_NODES], edge_t edges[N_EDGES], node_index_t starting_node, level_t level[N_NODES], edge_index_t level_counts[N_LEVELS]);
void bfs(node_t* host_nodes,
edge_t* host_edges,
level_t* host_level,
edge_index_t* host_level_counts,
node_t* nodes,
edge_t* edges,
level_t* level,
edge_index_t* level_counts,
node_index_t starting_node);
33 changes: 24 additions & 9 deletions MachSuite/bfs/queue/local_support.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,38 @@ int INPUT_SIZE = sizeof(struct bench_args_t);

void run_benchmark( void *vargs ) {
struct bench_args_t *args = (struct bench_args_t *)vargs;
node_t* host_nodes = malloc_aligned_memcpy(&args->nodes, sizeof(args->nodes));
edge_t* host_edges = malloc_aligned_memcpy(&args->edges, sizeof(args->edges));
level_t* host_level = malloc_aligned_memcpy(&args->level, sizeof(args->level));
edge_index_t* host_level_counts = calloc_aligned(sizeof(args->level_counts));
node_t* accel_nodes = malloc_aligned(sizeof(args->nodes));
edge_t* accel_edges = malloc_aligned(sizeof(args->edges));
level_t* accel_level = malloc_aligned(sizeof(args->level));
edge_index_t* accel_level_counts = calloc_aligned(sizeof(args->level_counts));
#ifdef GEM5_HARNESS
mapArrayToAccelerator(
MACHSUITE_BFS_QUEUE, "nodes", (void*)&args->nodes, sizeof(args->nodes));
MACHSUITE_BFS_QUEUE, "host_nodes", host_nodes, sizeof(args->nodes));
mapArrayToAccelerator(
MACHSUITE_BFS_QUEUE, "edges", (void*)&args->edges, sizeof(args->edges));
MACHSUITE_BFS_QUEUE, "host_edges", host_edges, sizeof(args->edges));
mapArrayToAccelerator(
MACHSUITE_BFS_QUEUE, "starting_node", (void*)&args->starting_node,
sizeof(args->starting_node));
MACHSUITE_BFS_QUEUE, "host_level", host_level, sizeof(args->level));
mapArrayToAccelerator(
MACHSUITE_BFS_QUEUE, "level", (void*)&args->level, sizeof(args->level));
mapArrayToAccelerator(
MACHSUITE_BFS_QUEUE, "level_counts", (void*)&args->level_counts,
sizeof(args->level_counts));
MACHSUITE_BFS_QUEUE, "host_level_counts", host_level_counts, sizeof(args->level_counts));
invokeAcceleratorAndBlock(MACHSUITE_BFS_QUEUE);
#else
bfs(args->nodes, args->edges, args->starting_node, args->level, args->level_counts);
bfs(host_nodes, host_edges, host_level, host_level_counts,
accel_nodes, accel_edges, accel_level, accel_level_counts,
args->starting_node);
#endif
memcpy(&args->level_counts, host_level_counts, sizeof(args->level_counts));
free(host_nodes);
free(host_edges);
free(host_level);
free(host_level_counts);
free(accel_nodes);
free(accel_edges);
free(accel_level);
free(accel_level_counts);
}

/* Input format:
Expand Down
6 changes: 4 additions & 2 deletions MachSuite/common/Makefile.gem5
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
# The gem5 dependencies are listed as .cpp files for compatibility with gem5.
# To compile them with C code, we need to explicitly use gcc.

.PHONY: gem5 gem5-cpu gem5-accel clean-gem5
.PHONY: all gem5 gem5-cpu gem5-accel clean-gem5

GEM5_SRCS = aladdin_sys_connection.cpp aladdin_sys_constants.cpp dma_interface.c
GEM5_FULL_SRCS = $(GEM5_SRCS:%=$(ALADDIN_HOME)/gem5/%)

# For the MachSuite harness and support system.
CFLAGS = -O3 -Wall -Wno-unused-label -I../../common -DDMA_INTERFACE_V3 -DDMA_MODE
CFLAGS = -O3 -Wall -Wno-unused-label -I../../common -I$(ALADDIN_HOME) -DDMA_INTERFACE_V3 -DDMA_MODE
ALL_SRCS = $(SRCS) ../../common/harness.c

GEM5_CFLAGS = -static -O3
Expand All @@ -19,6 +19,8 @@ INCLUDES += -I$(ALADDIN_HOME)
LFLAGS = -lm
BMARK = $(ACCEL_NAME)

all: gem5

# Builds both standalone CPU version and the HW accelerated version.
gem5: gem5-cpu gem5-accel

Expand Down
12 changes: 12 additions & 0 deletions MachSuite/common/support.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,18 @@ void* malloc_aligned(size_t size) {
return ptr;
}

// Returns a buffer of `size` bytes obtained from malloc_aligned() with every
// byte cleared to zero.
void* calloc_aligned(size_t size) {
  void* zeroed = malloc_aligned(size);
  // memset() hands back its destination pointer, so the cleared buffer can be
  // returned directly.
  return memset(zeroed, 0, size);
}

// Obtains a fresh buffer of `size` bytes from malloc_aligned() and fills it
// with a copy of the `size` bytes starting at `src`. Returns the new buffer.
void* malloc_aligned_memcpy(void* src, size_t size) {
  void* duplicate = malloc_aligned(size);
  // memcpy() returns its destination, i.e. the newly allocated copy.
  return memcpy(duplicate, src, size);
}

size_t next_multiple(size_t request, size_t align) {
size_t n = request / align;
if (n == 0)
Expand Down
4 changes: 4 additions & 0 deletions MachSuite/common/support.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
#define CACHELINE_SIZE 64

void* malloc_aligned(size_t size);
void* calloc_aligned(size_t size);
// Allocates new aligned memory and copies src (which may be unaligned) into
// it.
void* malloc_aligned_memcpy(void* src, size_t size);
size_t next_multiple(size_t request, size_t align);

///// File and section functions
Expand Down
Loading

0 comments on commit e28b131

Please sign in to comment.