Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions vortex-cuda/kernels/src/dynamic_dispatch.cu
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,8 @@ __device__ void execute_output_stage(T *__restrict output,
char *__restrict smem,
uint64_t block_start,
uint32_t block_len) {
constexpr uint32_t VALUES_PER_TILE = 32 / sizeof(T);
// Cap at 4 values per thread per tile to minimise register pressure.
constexpr uint32_t VALUES_PER_TILE = (32 / sizeof(T)) < 4 ? (32 / sizeof(T)) : 4;
const uint32_t tile_size = blockDim.x * VALUES_PER_TILE;
const auto &src = stage.source;
const void *raw_input = reinterpret_cast<const void *>(stage.input_ptr);
Expand Down Expand Up @@ -472,9 +473,10 @@ dynamic_dispatch(T *__restrict output, uint64_t array_len, const uint8_t *__rest
// matters is load_element(), which dispatches on the per-op PTypeTag to
// sign-extend or zero-extend when widening a narrow source to T.
#define GENERATE_KERNEL(suffix, Type) \
extern "C" __global__ void dynamic_dispatch_##suffix(Type *__restrict output, \
uint64_t array_len, \
const uint8_t *__restrict packed_plan) { \
extern "C" __global__ void __launch_bounds__(BLOCK_SIZE, 32) \
dynamic_dispatch_##suffix(Type *__restrict output, \
uint64_t array_len, \
const uint8_t *__restrict packed_plan) { \
dynamic_dispatch<Type>(output, array_len, packed_plan); \
}

Expand Down
Loading