Skip to content

Commit

Permalink
Merge sort (#2850)
Browse files Browse the repository at this point in the history
* Update pbfgraphparser.cc

* Update pbfgraphparser.cc

* Update util.cc

* Update sequence.h

* Update pbfgraphparser.h

* Update filesystem.h

* lint

* lint

* lint

* missing include

* fix typo

* Update graphparser.cc

Fixing unit tests

* Fixing countryaccess.cc tests

* Fix more tests

* Fix test

* sequester the changes inside of sequence.h and add unit test

* update changelog, revert unneeded change

* fix mac type coercion

Co-authored-by: Kevin Kreiser <kevinkreiser@gmail.com>
  • Loading branch information
kevinventullo and kevinkreiser committed Feb 11, 2021
1 parent 68f4fa3 commit 67b8e82
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 6 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Expand Up @@ -24,6 +24,7 @@
* ADDED: Add costing option `use_living_streets` to avoid or favor living streets in route. [#2788](https://github.com/valhalla/valhalla/pull/2788)
* CHANGED: Do not allocate mapped_cache vector in skadi when no elevation source is provided. [#2841](https://github.com/valhalla/valhalla/pull/2841)
* ADDED: Added support for destination for conditional access restrictions [#2857](https://github.com/valhalla/valhalla/pull/2857)
* CHANGED: Large sequences are now merge sorted which can be dramatically faster with certain hardware configurations. This is especially useful in speeding up the earlier stages (parsing, graph construction) of tile building [#2850](https://github.com/valhalla/valhalla/pull/2850)

## Release Date: 2021-01-25 Valhalla 3.1.0
* **Removed**
Expand Down
22 changes: 22 additions & 0 deletions test/util_midgard.cc
Expand Up @@ -2,8 +2,10 @@
#include "midgard/distanceapproximator.h"
#include "midgard/encoded.h"
#include "midgard/polyline2.h"
#include "midgard/sequence.h"
#include "midgard/util.h"
#include <cmath>
#include <cstdlib>
#include <random>

#include <list>
Expand Down Expand Up @@ -694,6 +696,26 @@ TEST(UtilMidgard, Base64) {
}
}

// Checks that sorting a file backed sequence yields the same order as std::sort on an in-memory
// copy of the same values, both when the sort buffer is smaller than the data and when it is
// larger than the data.
TEST(UtilMidgard, SequenceSort) {
  // `merge` works with 1327 elements at a time, `standard` with five times that, and 4.5 times
  // that many values are written to each
  constexpr int run_length = 1327;
  constexpr int value_count = int(run_length * 4.5);

  std::vector<uint8_t> expected;
  valhalla::midgard::sequence<uint8_t> merge("char_sequence_test_merge.bin", true, run_length);
  valhalla::midgard::sequence<uint8_t> standard("char_sequence_test_standard.bin", true,
                                                run_length * 5);

  // feed the same pseudo random bytes to all three containers
  for (int remaining = value_count; remaining > 0; --remaining) {
    auto value = static_cast<uint8_t>(rand() % std::numeric_limits<uint8_t>::max());
    expected.push_back(value);
    merge.push_back(value);
    standard.push_back(value);
  }

  // the in-memory sort is the reference result
  std::sort(expected.begin(), expected.end());
  merge.sort(std::less<uint8_t>(), run_length);
  standard.sort(std::less<uint8_t>(), run_length * 5);

  const bool merge_matches = std::equal(expected.begin(), expected.end(), merge.begin());
  EXPECT_TRUE(merge_matches);
  const bool standard_matches = std::equal(expected.begin(), expected.end(), standard.begin());
  EXPECT_TRUE(standard_matches);
}

} // namespace

int main(int argc, char* argv[]) {
Expand Down
4 changes: 4 additions & 0 deletions valhalla/filesystem.h
Expand Up @@ -421,6 +421,10 @@ inline void resize_file(const path& p, std::uintmax_t new_size) {
throw std::runtime_error(std::string("Failed to resize path: ") + strerror(errno));
}

// Renames the file system entry at `p` to `q` by delegating to the C library ::rename.
// Returns true when ::rename reports success (0) and false otherwise.
inline bool rename(const path& p, const path& q) {
  const int status = ::rename(p.c_str(), q.c_str());
  return status == 0;
}

inline bool remove(const path& p) {
bool ret = ::remove(p.c_str()) == 0;

Expand Down
73 changes: 67 additions & 6 deletions valhalla/midgard/sequence.h
@@ -1,5 +1,4 @@
#ifndef VALHALLA_MJOLNIR_SEQUENCE_H_
#define VALHALLA_MJOLNIR_SEQUENCE_H_
#pragma once

#include <algorithm>
#include <cerrno>
Expand All @@ -14,6 +13,7 @@
#include <list>
#include <map>
#include <memory>
#include <queue>
#include <stdexcept>
#include <string>
#include <type_traits>
Expand Down Expand Up @@ -264,6 +264,8 @@ template <class T> class sequence {
// static_assert(std::is_pod<T>::value, "sequence requires POD types for now");
static const size_t npos = -1;

using value_type = T;

sequence() = delete;

sequence(const sequence&) = delete;
Expand Down Expand Up @@ -323,15 +325,70 @@ template <class T> class sequence {
return npos;
}

// sort the file based on the predicate
// sort the file based on the predicate, and outputs to output_seq
//
// Strategy is to first sort sub-ranges of length buffer_size in place.
// These should all fit in memory. Then, merge the sub-ranges into the
// output sequence via priority queue.
void sort(const std::function<bool(const T&, const T&)>& predicate,
size_t buffer_size = 1024 * 1024 * 512 / sizeof(T)) {
flush();
// if no elements we are done
if (memmap.size() == 0) {
return;
}
std::sort(static_cast<T*>(memmap), static_cast<T*>(memmap) + memmap.size(), predicate);

// If there wont be any merging we may as well take the simple approach
if (buffer_size > memmap.size() + write_buffer.size()) {
std::sort(static_cast<T*>(memmap), static_cast<T*>(memmap) + memmap.size(), predicate);
return;
}

auto tmp_path = filesystem::path(file_name).replace_filename(
filesystem::path(file_name).filename().string() + ".tmp");
{
// we need a temporary sequence to merge the sorted subsections into
sequence<T> output_seq(tmp_path.string(), true);

// Comparator needs to be inverted for pq to provide constant time *smallest* lookup
// Pq keeps track of element and its index.
auto cmp = [&predicate](const std::pair<T, int>& a, std::pair<T, int>& b) {
return predicate(b.first, a.first);
};
std::priority_queue<std::pair<T, int>, std::vector<std::pair<T, int>>, decltype(cmp)> pq(cmp);

// Sort the subsections
for (size_t i = 0; i < memmap.size(); i += buffer_size) {
std::sort(static_cast<T*>(memmap) + i,
static_cast<T*>(memmap) + std::min(memmap.size(), i + buffer_size), predicate);
pq.emplace(*at(i), i);
}

// Perform the merge
while (!pq.empty()) {
auto tmp = pq.top();
pq.pop();
output_seq.push_back(tmp.first);
auto new_idx = tmp.second + 1;
if (new_idx % buffer_size != 0 && new_idx < memmap.size()) {
pq.emplace(*at(new_idx), new_idx);
}
}
output_seq.flush();
}

// Forget about this file for a second so we can swap in the temp file
file.reset();
memmap.unmap();

// Move the sorted result back into place
filesystem::remove(file_name);
filesystem::rename(tmp_path, file_name);

// Reload the sequence
sequence<T> reloaded(file_name, false);
std::swap(file, reloaded.file);
std::swap(memmap, reloaded.memmap);
return;
}

Expand Down Expand Up @@ -381,6 +438,12 @@ template <class T> class sequence {
friend class sequence;

public:
using iterator_category = std::random_access_iterator_tag;
using value_type = T;
using difference_type = std::ptrdiff_t;
using pointer = T*;
using reference = T&;

// static_assert(std::is_pod<T>::value, "sequence_element requires POD types for now");
iterator() = delete;
iterator& operator=(const iterator& other) {
Expand Down Expand Up @@ -650,5 +713,3 @@ struct tar {

} // namespace midgard
} // namespace valhalla

#endif // VALHALLA_MJOLNIR_SEQUENCE_H_

0 comments on commit 67b8e82

Please sign in to comment.