Skip to content

Commit 67ce5e7

Browse files
authored
No Double Buffering Mode (#4)
In no_double_buffering mode, only a single std::vector per column is used during sorting. The order of swaps is first computed and cached by a PermutationAnalysis, and is then applied in place to every column.
1 parent 349a804 commit 67ce5e7

File tree

2 files changed

+145
-6
lines changed

2 files changed

+145
-6
lines changed

benchmarks.cc

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,19 @@ static void BM_SoaSortBySensorId_ArrayData(benchmark::State& state) {
9292
}
9393
}
9494

95+
96+
// Benchmarks sorting the array-data SoA by column 0 with double buffering
// disabled. Copying the fixture and switching the mode happen while the
// timer is paused, so only the sort itself is measured.
static void BM_SoaSortBySensorId_ArrayData_NoDoubleBuffering(benchmark::State& state) {
    for (auto _ : state) {
        // Untimed setup: take a fresh copy each iteration and switch it
        // to single-buffer mode.
        state.PauseTiming();
        auto measurements = random_array_data.measurements_soa;
        measurements.set_no_double_buffering();
        state.ResumeTiming();

        // Timed section.
        measurements.sort_by_field<0>();

        // Read one sorted element so the sort cannot be optimized away.
        benchmark::DoNotOptimize(measurements.get_column<0>()[0]);
    }
}
107+
95108
static void BM_SoaSortBySensorId_StringData(benchmark::State& state) {
96109
for (auto _ : state) {
97110
state.PauseTiming();
@@ -158,7 +171,9 @@ static void BM_VecSumTimestamps_ArrayData(benchmark::State& state) {
158171

159172
// Register the function as a benchmark
160173
BENCHMARK(BM_SoaSortBySensorId_ArrayData);
174+
BENCHMARK(BM_SoaSortBySensorId_ArrayData_NoDoubleBuffering);
161175
BENCHMARK(BM_VecSortBySensorId_ArrayData);
176+
162177
BENCHMARK(BM_SoaSortBySensorId_StringData);
163178
BENCHMARK(BM_VecSortBySensorId_StringData);
164179

vapid/soa.h

Lines changed: 130 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,81 @@ namespace vapid {
3232
TupleDumper<std::integral_constant<uint16_t, 0>, T>::dump(ss, t);
3333
}
3434

35+
// Decomposes a permutation into its disjoint cycles and caches, for every
// cycle, its length and its smallest element.
//
// After store_analysis(p), cycle_sizes and cycle_mins both have p.size()
// entries. The first k entries describe the k cycles of p, ordered by
// increasing minimum element; the remaining entries are zero. Because a
// real cycle always has size >= 1, a zero in cycle_sizes marks the first
// unused slot.
//
// The input is expected to be a valid permutation of 0..p.size()-1; it is
// not validated.
struct PermutationAnalysis {
    PermutationAnalysis() {}
    PermutationAnalysis(const std::vector<size_t>& permutation) {
        store_analysis(permutation);
    }

    // Sizes all three buffers to perm_size and clears their contents.
    void reset(size_t perm_size) {
        element_visited.assign(perm_size, false);
        cycle_sizes.assign(perm_size, 0);
        cycle_mins.assign(perm_size, 0);
    }

    // Walks every cycle of `permutation` once, recording its minimum
    // element and length in the next free slot of cycle_mins/cycle_sizes.
    void store_analysis(const std::vector<size_t>& permutation) {
        const size_t total = permutation.size();
        reset(total);

        size_t pos = 0;            // element currently being followed
        size_t smallest = 0;       // minimum element of the current cycle
        size_t length = 0;         // elements seen so far in the current cycle
        size_t visited_count = 0;  // elements seen so far across all cycles
        size_t slot = 0;           // output index of the current cycle

        while (visited_count < total) {
            const size_t successor = permutation[pos];
            if (successor < smallest) {
                smallest = successor;
            }
            element_visited[pos] = true;
            ++visited_count;
            ++length;

            if (successor != smallest) {
                // still inside the cycle: keep following it
                pos = successor;
                continue;
            }

            // the walk is back at the cycle's minimum: the cycle is closed
            cycle_mins[slot] = smallest;
            cycle_sizes[slot] = length;

            if (visited_count == total) {
                // every cycle has been explored
                break;
            }

            // the next cycle starts at the first element not yet visited
            pos = smallest;
            while (element_visited[pos]) {
                ++pos;
            }
            ++slot;
            smallest = pos;
            length = 0;
        }
    }

    // Work buffer marking which elements of the permutation have been
    // touched; only meaningful while store_analysis is running.
    std::vector<bool> element_visited;

    // Per-cycle results: the size of each cycle and the minimum element
    // of each cycle, stored at matching indices.
    std::vector<size_t> cycle_sizes;
    std::vector<size_t> cycle_mins;
};
109+
35110
template <typename... Ts>
36111
class soa {
37112
public:
@@ -43,6 +118,20 @@ namespace vapid {
43118
template <size_t col_idx>
44119
using col_type = typename nth_col_type<col_idx>::value_type;
45120

121+
122+
/* Constructs an empty soa.
 *
 * By default, each column is double-buffered by two std::vectors.
 * When a sort order is determined, the data from the front (unsorted)
 * column is moved, in order, into the back column (which is now sorted),
 * and the two columns are swapped.
 *
 * If memory usage is a concern, no_double_buffering=true changes this
 * behavior: when a sort order is determined in this mode, elements are
 * swapped within the single buffer. This is slower than the
 * double-buffered approach, but uses half the memory.
 */
soa(bool no_double_buffering=false)
    : no_double_buffering_(no_double_buffering) {}
134+
46135
template<size_t col_idx>
47136
const nth_col_type<col_idx>& get_column() const {
48137
return std::get<col_idx>(data_);
@@ -112,6 +201,10 @@ namespace vapid {
112201
sort_order_reference_.end(),
113202
comparator_wrapper);
114203

204+
if (no_double_buffering_) {
205+
sort_order_analysis_.store_analysis(sort_order_reference_);
206+
}
207+
115208
sort_by_reference_impl(std::index_sequence_for<Ts...>{});
116209
}
117210

@@ -146,6 +239,10 @@ namespace vapid {
146239
return resize_impl(std::index_sequence_for<Ts...>{}, data_tmp_, size());
147240
}
148241

242+
// Selects the reordering strategy used by subsequent sorts:
// ndb == true reorders each column in place within its single buffer,
// ndb == false reorders through the temporary back buffer.
// NOTE(review): this only flips the flag; it does not appear to release
// storage already held by the temporary buffers - confirm whether it should.
void set_no_double_buffering(bool ndb = true) {
    no_double_buffering_ = ndb;
}
245+
149246
private:
150247
template <typename T, size_t... I>
151248
void insert_impl(std::integer_sequence<size_t, I...>, T t) {
@@ -190,22 +287,49 @@ namespace vapid {
190287

191288
template <size_t col_idx>
192289
void sort_col_by_reference(std::integral_constant<size_t, col_idx>) {
193-
auto& src = std::get<col_idx>(data_);
194-
auto& dst = std::get<col_idx>(data_tmp_);
195-
dst.resize(src.size());
196-
for (size_t idx = 0; idx < src.size(); ++idx) {
197-
dst[idx] = std::move(src[sort_order_reference_[idx]]);
290+
if (no_double_buffering_) {
291+
auto& src = std::get<col_idx>(data_);
292+
size_t curr = 0;
293+
for (size_t cycle_idx = 0; cycle_idx <= sort_order_analysis_.cycle_sizes.size(); ++cycle_idx) {
294+
curr = sort_order_analysis_.cycle_mins[cycle_idx];
295+
296+
size_t cycle_size = sort_order_analysis_.cycle_sizes[cycle_idx];
297+
if (cycle_size == 0) {
298+
break;
299+
}
300+
size_t cycle_min = sort_order_analysis_.cycle_mins[cycle_idx];
301+
for (size_t i = 0; i+1 < cycle_size; ++i) {
302+
size_t next = sort_order_reference_[curr];
303+
std::swap(src[curr], src[next]);
304+
curr = next;
305+
}
306+
}
307+
} else {
308+
auto& src = std::get<col_idx>(data_);
309+
auto& dst = std::get<col_idx>(data_tmp_);
310+
311+
dst.resize(src.size());
312+
for (size_t idx = 0; idx < src.size(); ++idx) {
313+
dst[idx] = std::move(src[sort_order_reference_[idx]]);
314+
}
315+
std::swap(src, dst);
198316
}
199-
std::swap(src, dst);
200317
}
201318

319+
bool no_double_buffering_ = false;
320+
202321
std::tuple<std::vector<Ts>...> data_;
203322

204323
// tmp buffers for reordering when sorting
324+
// disable this by setting no_double_buffering=true
205325
std::tuple<std::vector<Ts>...> data_tmp_;
206326

207327
// the reference permutation describing sorted order
208328
std::vector<size_t> sort_order_reference_;
329+
330+
// permutation analysis used in single buffering mode
331+
PermutationAnalysis sort_order_analysis_;
332+
209333
};
210334

211335
template <typename... Ts>

0 commit comments

Comments
 (0)