Skip to content

Commit

Permalink
spike: use different tag map
Browse files Browse the repository at this point in the history
Building a std::map for tags is somewhat expensive, especially when
we know that the number of tags is usually quite small.

Instead, use a custom structure that does a crappy-but-fast hash
to put the keys/values in one of 16 buckets, then linear search
the bucket.

For GB, before:
```
real 1m11.507s
user 16m49.604s
sys 0m17.381s
```

After:
```
real	1m9.557s
user	16m28.826s
sys	0m17.937s
```

Saving 2 seconds of wall clock and 20 seconds of user time doesn't
seem like much, but (a) it's not nothing and (b) having the tags
in this format will enable us to thwart some of Lua's defensive
copies in a subsequent commit.

A note about the hash function: hashing each letter of the string
using boost::hash_combine eliminated the time savings.
  • Loading branch information
cldellow committed Dec 4, 2023
1 parent ee342bd commit b322166
Show file tree
Hide file tree
Showing 8 changed files with 180 additions and 22 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -83,15 +83,15 @@ prefix = /usr/local

MANPREFIX := /usr/share/man
TM_VERSION ?= $(shell git describe --tags --abbrev=0)
CXXFLAGS ?= -O3 -Wall -Wno-unknown-pragmas -Wno-sign-compare -std=c++14 -pthread -fPIE -DTM_VERSION=$(TM_VERSION) $(CONFIG)
CXXFLAGS ?= -O3 -Wall -Wno-unknown-pragmas -Wno-sign-compare -std=c++14 -pthread -fPIE -DTM_VERSION=$(TM_VERSION) $(CONFIG) -g
LIB := -L$(PLATFORM_PATH)/lib -lz $(LUA_LIBS) -lboost_program_options -lsqlite3 -lboost_filesystem -lboost_system -lboost_iostreams -lprotobuf -lshp -pthread
INC := -I$(PLATFORM_PATH)/include -isystem ./include -I./src $(LUA_CFLAGS)

# Targets

all: tilemaker

tilemaker: include/osmformat.pb.o include/vector_tile.pb.o src/mbtiles.o src/pbf_blocks.o src/coordinates.o src/osm_store.o src/helpers.o src/output_object.o src/read_shp.o src/read_pbf.o src/osm_lua_processing.o src/write_geometry.o src/shared_data.o src/tile_worker.o src/tile_data.o src/osm_mem_tiles.o src/shp_mem_tiles.o src/attribute_store.o src/tilemaker.o src/geom.o
tilemaker: include/osmformat.pb.o include/vector_tile.pb.o src/mbtiles.o src/pbf_blocks.o src/coordinates.o src/osm_store.o src/helpers.o src/output_object.o src/read_shp.o src/read_pbf.o src/osm_lua_processing.o src/write_geometry.o src/shared_data.o src/tile_worker.o src/tile_data.o src/osm_mem_tiles.o src/shp_mem_tiles.o src/attribute_store.o src/tilemaker.o src/geom.o src/tag_map.o
$(CXX) $(CXXFLAGS) -o tilemaker $^ $(INC) $(LIB) $(LDFLAGS)

%.o: %.cpp
Expand Down
13 changes: 8 additions & 5 deletions include/osm_lua_processing.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

#include <boost/container/flat_map.hpp>

class TagMap;

// Lua
extern "C" {
#include "lua.h"
Expand Down Expand Up @@ -71,19 +73,19 @@ class OsmLuaProcessing {
using tag_map_t = boost::container::flat_map<std::string, std::string>;

// Scan non-MP relation
bool scanRelation(WayID id, const tag_map_t &tags);
bool scanRelation(WayID id, const TagMap& tags);

/// \brief We are now processing a significant node
void setNode(NodeID id, LatpLon node, const tag_map_t &tags);
void setNode(NodeID id, LatpLon node, const TagMap& tags);

/// \brief We are now processing a way
void setWay(WayID wayId, LatpLonVec const &llVec, const tag_map_t &tags);
void setWay(WayID wayId, LatpLonVec const &llVec, const TagMap& tags);

/** \brief We are now processing a relation
* (note that we store relations as ways with artificial IDs, and that
* we use decrementing positive IDs to give a bit more space for way IDs)
*/
void setRelation(int64_t relationId, WayVec const &outerWayVec, WayVec const &innerWayVec, const tag_map_t &tags, bool isNativeMP, bool isInnerOuter);
void setRelation(int64_t relationId, WayVec const &outerWayVec, WayVec const &innerWayVec, const TagMap& tags, bool isNativeMP, bool isInnerOuter);

// ---- Metadata queries called from Lua

Expand Down Expand Up @@ -253,7 +255,8 @@ class OsmLuaProcessing {
class LayerDefinition &layers;

std::vector<std::pair<OutputObject, AttributeSet>> outputs; // All output objects that have been created
const boost::container::flat_map<std::string, std::string>* currentTags;
//const boost::container::flat_map<std::string, std::string>* currentTags;
const TagMap* currentTags;

std::vector<OutputObject> finalizeOutputs();
};
Expand Down
11 changes: 11 additions & 0 deletions include/read_pbf.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
// Protobuf
#include "osmformat.pb.h"
#include "vector_tile.pb.h"
#include "tag_map.h"

class OsmLuaProcessing;

Expand All @@ -34,6 +35,7 @@ class PbfReader
pbfreader_generate_output const &generate_output);

// Read tags into a map from a way/node/relation
/*
using tag_map_t = boost::container::flat_map<std::string, std::string>;
template<typename T>
void readTags(T &pbfObject, PrimitiveBlock const &pb, tag_map_t &tags) {
Expand All @@ -44,6 +46,15 @@ class PbfReader
tags[pb.stringtable().s(keysPtr->Get(n))] = pb.stringtable().s(valsPtr->Get(n));
}
}
*/
template<typename T>
void readTags(T &pbfObject, PrimitiveBlock const &pb, TagMap& tags) {
auto keysPtr = pbfObject.mutable_keys();
auto valsPtr = pbfObject.mutable_vals();
for (uint n=0; n < pbfObject.keys_size(); n++) {
tags.addTag(pb.stringtable().s(keysPtr->Get(n)), pb.stringtable().s(valsPtr->Get(n)));
}
}

private:
bool ReadBlock(std::istream &infile, OsmLuaProcessing &output, std::pair<std::size_t, std::size_t> progress, std::size_t datasize,
Expand Down
43 changes: 43 additions & 0 deletions include/tag_map.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#ifndef _TAG_MAP_H
#define _TAG_MAP_H

#include <vector>
#include <string>
#include <boost/container/flat_map.hpp>

// We track tags in a special structure, which enables some tricks when
// doing Lua interop.
//
// The alternative is a std::map - but often, our map is quite small.
// It's preferable to have a small set of vectors and do linear search.
//
// Further, we can avoid passing std::string from Lua -> C++ in some cases
// by first checking to see if the string we would have passed is already
// stored in our tag map, and passing a reference to its location.

// Assumptions:
// 1. Not thread-safe.
// 2. Lifetime of map is less than lifetime of keys/values that are passed.
class TagMap {
public:
TagMap();
void reset();

void addTag(const std::string& key, const std::string& value);
const std::string* getTag(const std::string& key) const;

boost::container::flat_map<std::string, std::string> exportToBoostMap() const;

private:
uint32_t ensureString(
std::vector<std::vector<const std::string*>>& vector,
const std::string& value
);


std::vector<std::vector<const std::string*>> keys;
std::vector<std::vector<uint32_t>> key2value;
std::vector<std::vector<const std::string*>> values;
};

#endif _TAG_MAP_H
21 changes: 12 additions & 9 deletions src/osm_lua_processing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "attribute_store.h"
#include "helpers.h"
#include <iostream>
#include "tag_map.h"


using namespace std;
Expand Down Expand Up @@ -152,14 +153,14 @@ string OsmLuaProcessing::Id() const {

// Check if there's a value for a given key
bool OsmLuaProcessing::Holds(const string& key) const {
return currentTags->find(key) != currentTags->end();
return currentTags->getTag(key) != nullptr;
}

// Get an OSM tag for a given key (or return empty string if none)
const string& OsmLuaProcessing::Find(const string& key) const {
auto it = currentTags->find(key);
if(it == currentTags->end()) return EMPTY_STRING;
return it->second;
auto it = currentTags->getTag(key);
if(it == nullptr) return EMPTY_STRING;
return *it;
}

// ---- Spatial queries called from Lua
Expand Down Expand Up @@ -562,7 +563,7 @@ void OsmLuaProcessing::setVectorLayerMetadata(const uint_least8_t layer, const s

// Scan relation (but don't write geometry)
// return true if we want it, false if we don't
bool OsmLuaProcessing::scanRelation(WayID id, const tag_map_t &tags) {
bool OsmLuaProcessing::scanRelation(WayID id, const TagMap& tags) {
reset();
originalOsmID = id;
isWay = false;
Expand All @@ -576,11 +577,13 @@ bool OsmLuaProcessing::scanRelation(WayID id, const tag_map_t &tags) {
}
if (!relationAccepted) return false;

osmStore.store_relation_tags(id, tags);
// If we're persisting, we need to make a real map that owns its
// own keys and values.
osmStore.store_relation_tags(id, tags.exportToBoostMap());
return true;
}

void OsmLuaProcessing::setNode(NodeID id, LatpLon node, const tag_map_t &tags) {
void OsmLuaProcessing::setNode(NodeID id, LatpLon node, const TagMap& tags) {

reset();
originalOsmID = id;
Expand Down Expand Up @@ -608,7 +611,7 @@ void OsmLuaProcessing::setNode(NodeID id, LatpLon node, const tag_map_t &tags) {
}

// We are now processing a way
void OsmLuaProcessing::setWay(WayID wayId, LatpLonVec const &llVec, const tag_map_t &tags) {
void OsmLuaProcessing::setWay(WayID wayId, LatpLonVec const &llVec, const TagMap& tags) {
reset();
originalOsmID = wayId;
isWay = true;
Expand Down Expand Up @@ -654,7 +657,7 @@ void OsmLuaProcessing::setWay(WayID wayId, LatpLonVec const &llVec, const tag_ma
}

// We are now processing a relation
void OsmLuaProcessing::setRelation(int64_t relationId, WayVec const &outerWayVec, WayVec const &innerWayVec, const tag_map_t &tags,
void OsmLuaProcessing::setRelation(int64_t relationId, WayVec const &outerWayVec, WayVec const &innerWayVec, const TagMap& tags,
bool isNativeMP, // only OSM type=multipolygon
bool isInnerOuter) { // any OSM relation with "inner" and "outer" roles (e.g. type=multipolygon|boundary)
reset();
Expand Down
18 changes: 12 additions & 6 deletions src/read_pbf.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <iostream>
#include "read_pbf.h"
#include "pbf_blocks.h"
#include "tag_map.h"

#include <boost/interprocess/streams/bufferstream.hpp>
#include <boost/asio/thread_pool.hpp>
Expand All @@ -18,6 +19,7 @@ PbfReader::PbfReader(OSMStore &osmStore)
bool PbfReader::ReadNodes(OsmLuaProcessing &output, PrimitiveGroup &pg, PrimitiveBlock const &pb, const unordered_set<int> &nodeKeyPositions)
{
// ---- Read nodes
TagMap tags;

if (pg.has_dense()) {
int64_t nodeId = 0;
Expand Down Expand Up @@ -49,11 +51,11 @@ bool PbfReader::ReadNodes(OsmLuaProcessing &output, PrimitiveGroup &pg, Primitiv

if (significant) {
// For tagged nodes, call Lua, then save the OutputObject
boost::container::flat_map<std::string, std::string> tags;
tags.reserve(kvPos / 2);
tags.reset();

for (uint n=kvStart; n<kvPos-1; n+=2) {
tags[pb.stringtable().s(dense.keys_vals(n))] = pb.stringtable().s(dense.keys_vals(n+1));
tags.addTag(pb.stringtable().s(dense.keys_vals(n)), pb.stringtable().s(dense.keys_vals(n+1)));

}
output.setNode(static_cast<NodeID>(nodeId), node, tags);
}
Expand All @@ -70,6 +72,7 @@ bool PbfReader::ReadWays(OsmLuaProcessing &output, PrimitiveGroup &pg, Primitive
// ---- Read ways

if (pg.ways_size() > 0) {
TagMap tags;
Way pbfWay;

std::vector<WayStore::element_t> ways;
Expand Down Expand Up @@ -103,7 +106,7 @@ bool PbfReader::ReadWays(OsmLuaProcessing &output, PrimitiveGroup &pg, Primitive
if (llVec.empty()) continue;

try {
tag_map_t tags;
tags.reset();
readTags(pbfWay, pb, tags);

// If we need it for later, store the way's coordinates in the global way store
Expand Down Expand Up @@ -132,14 +135,16 @@ bool PbfReader::ScanRelations(OsmLuaProcessing &output, PrimitiveGroup &pg, Prim
int typeKey = findStringPosition(pb, "type");
int mpKey = findStringPosition(pb, "multipolygon");

TagMap tags;

for (int j=0; j<pg.relations_size(); j++) {
Relation pbfRelation = pg.relations(j);
bool isMultiPolygon = RelationIsType(pbfRelation, typeKey, mpKey);
bool isAccepted = false;
WayID relid = static_cast<WayID>(pbfRelation.id());
if (!isMultiPolygon) {
if (output.canReadRelations()) {
tag_map_t tags;
tags.reset();
readTags(pbfRelation, pb, tags);
isAccepted = output.scanRelation(relid, tags);
}
Expand All @@ -161,6 +166,7 @@ bool PbfReader::ReadRelations(OsmLuaProcessing &output, PrimitiveGroup &pg, Prim
// ---- Read relations

if (pg.relations_size() > 0) {
TagMap tags;
std::vector<RelationStore::element_t> relations;

int typeKey = findStringPosition(pb, "type");
Expand Down Expand Up @@ -189,7 +195,7 @@ bool PbfReader::ReadRelations(OsmLuaProcessing &output, PrimitiveGroup &pg, Prim
}

try {
tag_map_t tags;
tags.reset();
readTags(pbfRelation, pb, tags);
output.setRelation(pbfRelation.id(), outerWayVec, innerWayVec, tags, isMultiPolygon, isInnerOuter);

Expand Down
91 changes: 91 additions & 0 deletions src/tag_map.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#include "tag_map.h"
#include <boost/functional/hash.hpp>
#include <iostream>

TagMap::TagMap() {
keys.resize(16);
key2value.resize(16);
values.resize(16);
}

void TagMap::reset() {
for (int i = 0; i < 16; i++) {
keys[i].clear();
key2value[i].clear();
values[i].clear();
}
}

const std::size_t hashString(const std::string& str) {
// This is a pretty crappy hash function in terms of bit
// avalanching and distribution of output values.
//
// But it's very good in terms of speed, which turns out
// to be the important measure.
std::size_t hash = str.size();
if (hash >= 4)
hash ^= *(uint32_t*)str.data();

return hash;
}

uint32_t TagMap::ensureString(
std::vector<std::vector<const std::string*>>& vector,
const std::string& value
) {
std::size_t hash = hashString(value);

const uint16_t shard = hash % vector.size();
for (int i = 0; i < vector[shard].size(); i++)
if (*(vector[shard][i]) == value)
return shard << 16 | i;

vector[shard].push_back(&value);
return shard << 16 | (vector[shard].size() - 1);
}


void TagMap::addTag(const std::string& key, const std::string& value) {
uint32_t valueLoc = ensureString(values, value);
// std::cout << "valueLoc = " << valueLoc << std::endl;
uint32_t keyLoc = ensureString(keys, key);
// std::cout << "keyLoc = " << keyLoc << std::endl;


const uint16_t shard = keyLoc >> 16;
const uint16_t pos = keyLoc;
// std::cout << "shard=" << shard << ", pos=" << pos << std::endl;
if (key2value[shard].size() <= pos) {
// std::cout << "growing shard" << std::endl;
key2value[shard].resize(pos + 1);
}

key2value[shard][pos] = valueLoc;
}

const std::string* TagMap::getTag(const std::string& key) const {
// Returns nullptr if absent, else pointer to value.
std::size_t hash = hashString(key);

const uint16_t shard = hash % keys.size();
for (int i = 0; i < keys[shard].size(); i++)
if (*(keys[shard][i]) == key) {
const uint32_t valueLoc = key2value[shard][i];
return values[valueLoc >> 16][valueLoc & 0xFFFF];
}

return nullptr;
}

boost::container::flat_map<std::string, std::string> TagMap::exportToBoostMap() const {
boost::container::flat_map<std::string, std::string> rv;

for (int i = 0; i < keys.size(); i++) {
for (int j = 0; j < keys[i].size(); j++) {
uint32_t valueLoc = key2value[i][j];
rv[*keys[i][j]] = *values[valueLoc >> 16][valueLoc & 0xFFFF];
}
}

return rv;
}
1 change: 1 addition & 0 deletions src/tilemaker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,7 @@ int main(int argc, char* argv[]) {
}

// ---- Write out data
return 0; // TODO

// If mapsplit, read list of tiles available
unsigned runs=1;
Expand Down

0 comments on commit b322166

Please sign in to comment.