No Double Buffering Mode (#4)

markisus · web-flow · commit 67ce5e7c1926 · 2022-02-11T12:41:26.000-08:00
In no_double_buffering mode, sorting uses only a single std::vector per column. The order of swaps is first cached by a PermutationAnalysis and then applied to every column.
diff --git a/benchmarks.cc b/benchmarks.cc
@@ -92,6 +92,19 @@ static void BM_SoaSortBySensorId_ArrayData(benchmark::State& state) {
     }
 }
 
+
+// Benchmarks soa::sort_by_field<0>() with double buffering switched off.
+// Each iteration works on a fresh copy of the array-data fixture; the copy
+// and the mode switch happen while the timer is paused.
+static void BM_SoaSortBySensorId_ArrayData_NoDoubleBuffering(benchmark::State& state) {
+    for (auto _ : state) {
+        // untimed setup: copy the fixture and select single-buffer sorting
+        state.PauseTiming();
+        auto columns = random_array_data.measurements_soa;
+        columns.set_no_double_buffering();
+        state.ResumeTiming();
+
+        // timed section: the sort itself
+        columns.sort_by_field<0>();
+        // consume one sorted element so the result counts as used
+        benchmark::DoNotOptimize(columns.get_column<0>()[0]);
+    }
+}
+
 static void BM_SoaSortBySensorId_StringData(benchmark::State& state) {
     for (auto _ : state) {
         state.PauseTiming();
@@ -158,7 +171,9 @@ static void BM_VecSumTimestamps_ArrayData(benchmark::State& state) {
 
 // Register the function as a benchmark
 BENCHMARK(BM_SoaSortBySensorId_ArrayData);
+BENCHMARK(BM_SoaSortBySensorId_ArrayData_NoDoubleBuffering);  // calls set_no_double_buffering() before sorting
 BENCHMARK(BM_VecSortBySensorId_ArrayData);
+
 BENCHMARK(BM_SoaSortBySensorId_StringData);
 BENCHMARK(BM_VecSortBySensorId_StringData);
 
diff --git a/vapid/soa.h b/vapid/soa.h
@@ -32,6 +32,81 @@ namespace vapid {
         TupleDumper<std::integral_constant<uint16_t, 0>, T>::dump(ss, t);
     }
 
+    // Cycle decomposition of a permutation of {0, ..., n-1}.
+    //
+    // store_analysis() walks the permutation one cycle at a time, always
+    // starting a new cycle at the smallest index not visited yet, and records
+    // for the k-th cycle found:
+    //   cycle_mins[k]  - the smallest index in that cycle
+    //   cycle_sizes[k] - the number of elements in that cycle
+    // Both arrays have n slots; slots past the last cycle stay 0.
+    struct PermutationAnalysis {
+        PermutationAnalysis() {}
+        PermutationAnalysis(const std::vector<size_t>& permutation) {
+            store_analysis(permutation);
+        }
+
+        // Size every buffer to perm_size and clear any previous contents.
+        void reset(size_t perm_size) {
+            element_visited.assign(perm_size, false);
+            cycle_sizes.assign(perm_size, 0);
+            cycle_mins.assign(perm_size, 0);
+        }
+
+        // Recompute cycle_mins / cycle_sizes for the given permutation.
+        // The input is expected to be a valid permutation; it is not checked.
+        void store_analysis(const std::vector<size_t>& permutation) {
+            const size_t total = permutation.size();
+            reset(total);
+
+            size_t pos = 0;          // index currently being visited
+            size_t start = 0;        // smallest index of the cycle being walked
+            size_t length = 0;       // elements seen so far in this cycle
+            size_t seen = 0;         // elements seen so far overall
+            size_t slot = 0;         // output slot for the cycle being walked
+
+            while (seen < total) {
+                const size_t target = permutation[pos];
+                start = std::min(start, target);
+                element_visited[pos] = true;
+                ++seen;
+                ++length;
+
+                if (target != start) {
+                    // still inside the cycle: follow the permutation
+                    pos = target;
+                    continue;
+                }
+
+                // the walk is back at its starting index: cycle complete
+                cycle_mins[slot] = start;
+                cycle_sizes[slot] = length;
+
+                if (seen == total) {
+                    // every element belongs to a recorded cycle
+                    break;
+                }
+
+                // advance to the smallest unvisited index and start there
+                pos = start;
+                while (element_visited[pos]) {
+                    ++pos;
+                }
+                ++slot;
+                start = pos;
+                length = 0;
+            }
+        }
+
+        // scratch buffer marking which indices have been walked;
+        // only meaningful while store_analysis() is running
+        std::vector<bool> element_visited;
+
+        // per-cycle results, indexed by the order in which cycles were found:
+        // the length of each cycle and the smallest index it contains
+        std::vector<size_t> cycle_sizes;
+        std::vector<size_t> cycle_mins;
+    };
+
     template <typename... Ts>
     class soa {
     public:
@@ -43,6 +118,20 @@ namespace vapid {
         template <size_t col_idx>
         using col_type = typename nth_col_type<col_idx>::value_type;
 
+
+        /* Construct a soa, optionally in single-buffer mode.
+         *
+         * By default, each column is double-buffered by two std::vectors.
+         * When a sort order is determined, the data from the front (unsorted)
+         * column is moved, in order, into the back column (which is now sorted),
+         * and the two columns swapped.
+         *
+         * If memory usage is a concern, no_double_buffering=true will change
+         * this behavior. When a sort order is determined in this mode,
+         * elements are swapped within the single buffer. This is slower
+         * than the double-buffered approach, but uses half the memory.
+         */
+        soa(bool no_double_buffering=false)
+            : no_double_buffering_(no_double_buffering) {}
+
         template<size_t col_idx>
         const nth_col_type<col_idx>& get_column() const {
             return std::get<col_idx>(data_);
@@ -112,6 +201,10 @@ namespace vapid {
                 sort_order_reference_.end(),
                 comparator_wrapper);
 
+            if (no_double_buffering_) {
+                sort_order_analysis_.store_analysis(sort_order_reference_);
+            }
+
             sort_by_reference_impl(std::index_sequence_for<Ts...>{});
         }
 
@@ -146,6 +239,10 @@ namespace vapid {
             return resize_impl(std::index_sequence_for<Ts...>{}, data_tmp_, size());
         }
 
+        // Switch single-buffer sorting on (ndb=true, the default) or off.
+        // Only stores the flag; it is read the next time a sort runs.
+        void set_no_double_buffering(bool ndb = true) { no_double_buffering_ = ndb; }
+
     private:
         template <typename T, size_t... I>
         void insert_impl(std::integer_sequence<size_t, I...>, T t) {
@@ -190,22 +287,49 @@ namespace vapid {
 
         template <size_t col_idx>
         void sort_col_by_reference(std::integral_constant<size_t, col_idx>) {
-            auto& src = std::get<col_idx>(data_);
-            auto& dst = std::get<col_idx>(data_tmp_);
-            dst.resize(src.size());
-            for (size_t idx = 0; idx < src.size(); ++idx) {
-                dst[idx] = std::move(src[sort_order_reference_[idx]]);
+            // Reorder column col_idx so that element i ends up holding what
+            // was at sort_order_reference_[i] before the call.
+            if (no_double_buffering_) {
+                // Single-buffer path: apply the permutation in place, one
+                // cycle at a time, using the cached cycle decomposition.
+                auto& src = std::get<col_idx>(data_);
+                const auto& cycle_sizes = sort_order_analysis_.cycle_sizes;
+                const auto& cycle_mins = sort_order_analysis_.cycle_mins;
+
+                // The bound must be strict: the arrays have one slot per
+                // element, so when every cycle has size 1 (already-sorted
+                // input) there is no zero entry to stop on and '<=' would
+                // index one past the end.
+                for (size_t cycle_idx = 0; cycle_idx < cycle_sizes.size(); ++cycle_idx) {
+                    const size_t cycle_size = cycle_sizes[cycle_idx];
+                    if (cycle_size == 0) {
+                        // unused slots are zero: no more cycles recorded
+                        break;
+                    }
+
+                    // Walk the cycle from its smallest index. Each swap puts
+                    // the final value into src[curr] and carries the cycle's
+                    // first value forward; size-1 swaps finish the cycle.
+                    size_t curr = cycle_mins[cycle_idx];
+                    for (size_t i = 0; i+1 < cycle_size; ++i) {
+                        const size_t next = sort_order_reference_[curr];
+                        std::swap(src[curr], src[next]);
+                        curr = next;
+                    }
+                }
+            } else {
+                // Double-buffer path: move elements into the tmp column in
+                // sorted order, then swap the two columns.
+                auto& src = std::get<col_idx>(data_);
+                auto& dst = std::get<col_idx>(data_tmp_);
+
+                dst.resize(src.size());
+                for (size_t idx = 0; idx < src.size(); ++idx) {
+                    dst[idx] = std::move(src[sort_order_reference_[idx]]);
+                }
+                std::swap(src, dst);
             }
-            std::swap(src, dst);
         }
 
+        bool no_double_buffering_ = false;  // when true, sort_col_by_reference reorders data_ in place by swaps
+
         std::tuple<std::vector<Ts>...> data_;
 
         // tmp buffers for reordering when sorting
+        // disable this by setting no_double_buffering=true
         std::tuple<std::vector<Ts>...> data_tmp_;
 
         // the reference permutation describing sorted order
         std::vector<size_t> sort_order_reference_;
+
+        // permutation analysis used in single buffering mode
+        PermutationAnalysis sort_order_analysis_;
+
     };
 
     template <typename... Ts>