Permalink
Browse files

Eliminated a memcpy in sorting, and added thrust_sort_96b, a 96-bit l…

…exicographical sort entirely in thrust. Slightly faster in my tests on my laptop. TODO: test on faster GPU.
  • Loading branch information...
1 parent adf7901 commit 356914886d1f91251741ffaf161b578262fdf489 @harrism harrism committed Apr 24, 2012
Showing with 105 additions and 17 deletions.
  1. +63 −1 runtime/CUDAkernels/support_kernels.cu
  2. +31 −2 runtime/src/load_kernels.cpp
  3. +11 −14 runtime/src/sort_bodies_gpu.cpp
@@ -10,7 +10,9 @@
#include <thrust/device_ptr.h>
#include <thrust/copy.h>
#include <thrust/sort.h>
- #include <thrust/device_vector.h>
+ #include <thrust/gather.h>
+ #include <thrust/device_vector.h>
+ #include <thrust/iterator/transform_iterator.h>
#include "../include/my_cuda_rt.h"
@@ -30,6 +32,66 @@
valuesOutput.copy(srcValues, N);
}
+ template <int keyIdx>
+ struct ExtractBits : public thrust::unary_function<uint4, uint>
+ {
+ __host__ __device__
+ uint operator()(uint4 key) const {
+ if(keyIdx == 0)
+ return key.x;
+ else if(keyIdx == 1)
+ return key.y;
+ else
+ return key.z;
+ }
+ };
+
+ template <int keyIdx, typename KeyPtr, typename PermutationPtr, typename ExtractedPtr>
+ void update_permutation(KeyPtr& keys, PermutationPtr& permutation, ExtractedPtr& temp, int N)
+ {
+ // permute the keys with the current reordering
+ thrust::gather(permutation, permutation + N,
+ thrust::make_transform_iterator(keys, ExtractBits<keyIdx>()), temp);
+
+ // stable_sort the permuted keys and update the permutation
+ thrust::stable_sort_by_key(temp, temp + N, permutation);
+ }
+
+
+ template <typename KeyPtr, typename PermutationPtr, typename OutputPtr>
+ void apply_permutation(KeyPtr& keys, PermutationPtr& permutation, OutputPtr& out, int N)
+ {
+ // permute the keys into out vector
+ thrust::gather(permutation, permutation + N, keys, out);
+ }
+
+ extern "C" void thrust_sort_96b(my_dev::dev_mem<uint4> &srcKeys,
+ my_dev::dev_mem<uint4> &sortedKeys,
+ my_dev::dev_mem<uint> &temp_buffer,
+ my_dev::dev_mem<uint> &permutation_buffer,
+ int N)
+ {
+
+ // wrap raw pointer with a device_ptr
+ thrust::device_ptr<uint4> keys = thrust::device_pointer_cast(srcKeys.raw_p());
+ thrust::device_ptr<uint4> outKeys = thrust::device_pointer_cast(sortedKeys.raw_p());
+ thrust::device_ptr<uint> temp = thrust::device_pointer_cast(temp_buffer.raw_p());
+ thrust::device_ptr<uint> permutation = thrust::device_pointer_cast(permutation_buffer.raw_p());
+
+ // initialize permutation to [0, 1, 2, ... ,N-1]
+ thrust::sequence(permutation, permutation + N);
+
+ // sort z, y, x
+ update_permutation<2>(keys, permutation, temp, N);
+ update_permutation<1>(keys, permutation, temp, N);
+ update_permutation<0>(keys, permutation, temp, N);
+
+ // Note: keys have not been modified
+ // Note: permutation now maps unsorted keys to sorted order
+
+ thrust::gather(permutation, permutation + N, keys, outKeys);
+ }
+
#endif
@@ -1,14 +1,19 @@
#include "octree.h"
// #define USE_THRUST
-
+// #define USE_THRUST_96
#ifdef USE_THRUST
extern "C" void thrust_sort_32b(my_dev::context &devContext,
my_dev::dev_mem<uint> &srcKeys, my_dev::dev_mem<uint> &srcValues,
my_dev::dev_mem<int> &keysOutput, my_dev::dev_mem<uint> &keysAPing,
my_dev::dev_mem<uint> &valuesOutput,my_dev::dev_mem<uint> &valuesAPing,
int N, int numberOfBits);
+ extern "C" void thrust_sort_96b(my_dev::dev_mem<uint4> &srcKeys,
+ my_dev::dev_mem<uint4> &sortedKeys,
+ my_dev::dev_mem<uint> &temp_buffer,
+ my_dev::dev_mem<uint> &permutation_buffer,
+ int N);
extern "C" void thrust_gpuCompact(my_dev::context &devContext,
my_dev::dev_mem<uint> &srcValues,
my_dev::dev_mem<uint> &output,
@@ -497,6 +502,30 @@ void octree::gpuSort(my_dev::context &devContext,
int N, int numberOfBits, int subItems,
tree_structure &tree) {
+#if defined(USE_THRUST) && defined(USE_THRUST_96)
+ //Extra buffer values
+ my_dev::dev_mem<uint> permutation(devContext); // Permutation values, for sorting the int4 data
+ my_dev::dev_mem<uint> temp_buffer(devContext); // temporary uint buffer
+
+ int prevOffsetSum = getAllignmentOffset(4*N); //The offset of output
+
+ permutation.cmalloc_copy(tree.generalBuffer1.get_pinned(),
+ tree.generalBuffer1.get_flags(),
+ tree.generalBuffer1.get_devMem(),
+ &tree.generalBuffer1[8*N], 8*N,
+ N, prevOffsetSum + getAllignmentOffset(8*N + prevOffsetSum)); //Ofset 8 since we have 2 uint4 before
+
+ prevOffsetSum += getAllignmentOffset(8*N + prevOffsetSum);
+
+ temp_buffer.cmalloc_copy(tree.generalBuffer1.get_pinned(),
+ tree.generalBuffer1.get_flags(),
+ tree.generalBuffer1.get_devMem(),
+ &tree.generalBuffer1[9*N], 9*N,
+ N, prevOffsetSum + getAllignmentOffset(9*N + prevOffsetSum)); //N elements after simpleKeys
+
+ thrust_sort_96b(srcValues, output, temp_buffer, permutation, N);
+
+#else
//Extra buffer values
my_dev::dev_mem<uint> simpleKeys(devContext); //Int keys,
my_dev::dev_mem<uint> permutation(devContext); //Permutation values, for sorting the int4 data
@@ -670,7 +699,7 @@ void octree::gpuSort(my_dev::context &devContext,
reOrderKeysValues.set_arg<cl_mem>(0, buffer.p());
reOrderKeysValues.set_arg<cl_mem>(1, output.p());
reOrderKeysValues.execute();
-
+#endif // USE_THRUST_96
}
@@ -136,16 +136,7 @@ void octree::sort_bodies(tree_structure &tree, bool doDomainUpdate) {
printf("Corner: %f %f %f idomain fac: %f domain_fac: %f\n",
tree.corner.x, tree.corner.y, tree.corner.z, idomain_fac, domain_fac);
printf("domain fac: %f idomain_fac: %f size: %f MAXLEVELS: %d \n", domain_fac, idomain_fac, size, MAXLEVELS);
-
- //Compute the keys
- build_key_list.set_arg<cl_mem>(0, tree.bodies_key.p());
- build_key_list.set_arg<cl_mem>(1, tree.bodies_Ppos.p());
- build_key_list.set_arg<int>(2, &tree.n);
- build_key_list.set_arg<real4>(3, &tree.corner);
-
- build_key_list.setWork(tree.n, 128); //128 threads per block
- build_key_list.execute();
-
+
//Call the GPUSort function, since we made it general
//into a uint4 so we can extend the tree to 96bit key
//we have to convert to 64bit key to a 96bit for sorting
@@ -159,14 +150,20 @@ void octree::sort_bodies(tree_structure &tree, bool doDomainUpdate) {
tree.generalBuffer1.get_devMem(),
&tree.generalBuffer1[0], 0,
tree.n, getAllignmentOffset(0));
-
- //Sort the keys, make a copy, and store sorted in original
- srcValues.copy(tree.bodies_key, tree.n);
+
+ //Compute the keys directly into srcValues
+ // will be sorted into tree.bodies_key below
+ build_key_list.set_arg<cl_mem>(0, srcValues.p());
+ build_key_list.set_arg<cl_mem>(1, tree.bodies_Ppos.p());
+ build_key_list.set_arg<int>(2, &tree.n);
+ build_key_list.set_arg<real4>(3, &tree.corner);
+
+ build_key_list.setWork(tree.n, 128); //128 threads per block
+ build_key_list.execute();
// If srcValues and buffer are different, then the original values
// are preserved, if they are the same srcValues will be overwritten
gpuSort(devContext, srcValues, tree.bodies_key,srcValues, tree.n, 32, 3, tree);
-
devContext.stopTiming("Sorting", 0);

0 comments on commit 3569148

Please sign in to comment.