Skip to content

Commit

Permalink
Eliminated a memcpy in sorting, and added thrust_sort_96b, a 96-bit l…
Browse files Browse the repository at this point in the history
…exicographical sort entirely in thrust. Slightly faster in my tests on my laptop. TODO: test on faster GPU.
  • Loading branch information
harrism committed Apr 24, 2012
1 parent adf7901 commit 3569148
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 17 deletions.
64 changes: 63 additions & 1 deletion runtime/CUDAkernels/support_kernels.cu
Expand Up @@ -10,7 +10,9 @@
#include <thrust/device_ptr.h>
#include <thrust/copy.h>
#include <thrust/sort.h>
#include <thrust/device_vector.h>
#include <thrust/gather.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/transform_iterator.h>

#include "../include/my_cuda_rt.h"

Expand All @@ -30,6 +32,66 @@
valuesOutput.copy(srcValues, N);
}

template <int keyIdx>
struct ExtractBits : public thrust::unary_function<uint4, uint>
{
__host__ __device__
uint operator()(uint4 key) const {
if(keyIdx == 0)
return key.x;
else if(keyIdx == 1)
return key.y;
else
return key.z;
}
};

template <int keyIdx, typename KeyPtr, typename PermutationPtr, typename ExtractedPtr>
void update_permutation(KeyPtr& keys, PermutationPtr& permutation, ExtractedPtr& temp, int N)
{
// permute the keys with the current reordering
thrust::gather(permutation, permutation + N,
thrust::make_transform_iterator(keys, ExtractBits<keyIdx>()), temp);

// stable_sort the permuted keys and update the permutation
thrust::stable_sort_by_key(temp, temp + N, permutation);
}


template <typename KeyPtr, typename PermutationPtr, typename OutputPtr>
void apply_permutation(KeyPtr& keys, PermutationPtr& permutation, OutputPtr& out, int N)
{
// permute the keys into out vector
thrust::gather(permutation, permutation + N, keys, out);
}

extern "C" void thrust_sort_96b(my_dev::dev_mem<uint4> &srcKeys,
my_dev::dev_mem<uint4> &sortedKeys,
my_dev::dev_mem<uint> &temp_buffer,
my_dev::dev_mem<uint> &permutation_buffer,
int N)
{

// wrap raw pointer with a device_ptr
thrust::device_ptr<uint4> keys = thrust::device_pointer_cast(srcKeys.raw_p());
thrust::device_ptr<uint4> outKeys = thrust::device_pointer_cast(sortedKeys.raw_p());
thrust::device_ptr<uint> temp = thrust::device_pointer_cast(temp_buffer.raw_p());
thrust::device_ptr<uint> permutation = thrust::device_pointer_cast(permutation_buffer.raw_p());

// initialize permutation to [0, 1, 2, ... ,N-1]
thrust::sequence(permutation, permutation + N);

// sort z, y, x
update_permutation<2>(keys, permutation, temp, N);
update_permutation<1>(keys, permutation, temp, N);
update_permutation<0>(keys, permutation, temp, N);

// Note: keys have not been modified
// Note: permutation now maps unsorted keys to sorted order

thrust::gather(permutation, permutation + N, keys, outKeys);
}

#endif


Expand Down
33 changes: 31 additions & 2 deletions runtime/src/load_kernels.cpp
@@ -1,14 +1,19 @@
#include "octree.h"

// #define USE_THRUST

// #define USE_THRUST_96

#ifdef USE_THRUST
extern "C" void thrust_sort_32b(my_dev::context &devContext,
my_dev::dev_mem<uint> &srcKeys, my_dev::dev_mem<uint> &srcValues,
my_dev::dev_mem<int> &keysOutput, my_dev::dev_mem<uint> &keysAPing,
my_dev::dev_mem<uint> &valuesOutput,my_dev::dev_mem<uint> &valuesAPing,
int N, int numberOfBits);
extern "C" void thrust_sort_96b(my_dev::dev_mem<uint4> &srcKeys,
my_dev::dev_mem<uint4> &sortedKeys,
my_dev::dev_mem<uint> &temp_buffer,
my_dev::dev_mem<uint> &permutation_buffer,
int N);
extern "C" void thrust_gpuCompact(my_dev::context &devContext,
my_dev::dev_mem<uint> &srcValues,
my_dev::dev_mem<uint> &output,
Expand Down Expand Up @@ -497,6 +502,30 @@ void octree::gpuSort(my_dev::context &devContext,
int N, int numberOfBits, int subItems,
tree_structure &tree) {

#if defined(USE_THRUST) && defined(USE_THRUST_96)
//Extra buffer values
my_dev::dev_mem<uint> permutation(devContext); // Permutation values, for sorting the int4 data
my_dev::dev_mem<uint> temp_buffer(devContext); // temporary uint buffer

int prevOffsetSum = getAllignmentOffset(4*N); //The offset of output

permutation.cmalloc_copy(tree.generalBuffer1.get_pinned(),
tree.generalBuffer1.get_flags(),
tree.generalBuffer1.get_devMem(),
&tree.generalBuffer1[8*N], 8*N,
N, prevOffsetSum + getAllignmentOffset(8*N + prevOffsetSum)); //Ofset 8 since we have 2 uint4 before

prevOffsetSum += getAllignmentOffset(8*N + prevOffsetSum);

temp_buffer.cmalloc_copy(tree.generalBuffer1.get_pinned(),
tree.generalBuffer1.get_flags(),
tree.generalBuffer1.get_devMem(),
&tree.generalBuffer1[9*N], 9*N,
N, prevOffsetSum + getAllignmentOffset(9*N + prevOffsetSum)); //N elements after simpleKeys

thrust_sort_96b(srcValues, output, temp_buffer, permutation, N);

#else
//Extra buffer values
my_dev::dev_mem<uint> simpleKeys(devContext); //Int keys,
my_dev::dev_mem<uint> permutation(devContext); //Permutation values, for sorting the int4 data
Expand Down Expand Up @@ -670,7 +699,7 @@ void octree::gpuSort(my_dev::context &devContext,
reOrderKeysValues.set_arg<cl_mem>(0, buffer.p());
reOrderKeysValues.set_arg<cl_mem>(1, output.p());
reOrderKeysValues.execute();
#endif // USE_THRUST_96
}


Expand Down
25 changes: 11 additions & 14 deletions runtime/src/sort_bodies_gpu.cpp
Expand Up @@ -136,16 +136,7 @@ void octree::sort_bodies(tree_structure &tree, bool doDomainUpdate) {
printf("Corner: %f %f %f idomain fac: %f domain_fac: %f\n",
tree.corner.x, tree.corner.y, tree.corner.z, idomain_fac, domain_fac);
printf("domain fac: %f idomain_fac: %f size: %f MAXLEVELS: %d \n", domain_fac, idomain_fac, size, MAXLEVELS);

//Compute the keys
build_key_list.set_arg<cl_mem>(0, tree.bodies_key.p());
build_key_list.set_arg<cl_mem>(1, tree.bodies_Ppos.p());
build_key_list.set_arg<int>(2, &tree.n);
build_key_list.set_arg<real4>(3, &tree.corner);

build_key_list.setWork(tree.n, 128); //128 threads per block
build_key_list.execute();


//Call the GPUSort function, since we made it general
//into a uint4 so we can extend the tree to 96bit key
//we have to convert to 64bit key to a 96bit for sorting
Expand All @@ -159,14 +150,20 @@ void octree::sort_bodies(tree_structure &tree, bool doDomainUpdate) {
tree.generalBuffer1.get_devMem(),
&tree.generalBuffer1[0], 0,
tree.n, getAllignmentOffset(0));

//Sort the keys, make a copy, and store sorted in original
srcValues.copy(tree.bodies_key, tree.n);

//Compute the keys directly into srcValues
// will be sorted into tree.bodies_key below
build_key_list.set_arg<cl_mem>(0, srcValues.p());
build_key_list.set_arg<cl_mem>(1, tree.bodies_Ppos.p());
build_key_list.set_arg<int>(2, &tree.n);
build_key_list.set_arg<real4>(3, &tree.corner);

build_key_list.setWork(tree.n, 128); //128 threads per block
build_key_list.execute();

// If srcValues and buffer are different, then the original values
// are preserved, if they are the same srcValues will be overwritten
gpuSort(devContext, srcValues, tree.bodies_key,srcValues, tree.n, 32, 3, tree);


devContext.stopTiming("Sorting", 0);

Expand Down

0 comments on commit 3569148

Please sign in to comment.