diff --git a/CMakeLists.txt b/CMakeLists.txt index ad353d84..803fb140 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,10 +81,6 @@ ADD_CUSTOM_COMMAND(OUTPUT vector_tile.pb.cc vector_tile.pb.h COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} ARGS --cpp_out ${CMAKE_BINARY_DIR} -I ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/include/vector_tile.proto) -ADD_CUSTOM_COMMAND(OUTPUT osmformat.pb.cc osmformat.pb.h - COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} - ARGS --cpp_out ${CMAKE_BINARY_DIR} -I ${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/include/osmformat.proto) - file(GLOB tilemaker_src_files src/attribute_store.cpp src/coordinates.cpp @@ -97,25 +93,30 @@ file(GLOB tilemaker_src_files src/mbtiles.cpp src/mmap_allocator.cpp src/node_stores.cpp + src/options_parser.cpp src/osm_lua_processing.cpp src/osm_mem_tiles.cpp src/osm_store.cpp src/output_object.cpp - src/pbf_blocks.cpp + src/pbf_processor.cpp + src/pbf_reader.cpp src/pmtiles.cpp - src/read_pbf.cpp + src/pooled_string.cpp src/read_shp.cpp + src/sharded_node_store.cpp + src/sharded_way_store.cpp src/shared_data.cpp src/shp_mem_tiles.cpp src/sorted_node_store.cpp src/sorted_way_store.cpp + src/tag_map.cpp src/tile_data.cpp src/tilemaker.cpp src/tile_worker.cpp src/way_stores.cpp src/write_geometry.cpp ) -add_executable(tilemaker vector_tile.pb.cc osmformat.pb.cc ${tilemaker_src_files}) +add_executable(tilemaker vector_tile.pb.cc ${tilemaker_src_files}) target_include_directories(tilemaker PRIVATE include) target_include_directories(tilemaker PRIVATE ${CMAKE_BINARY_DIR}) # for generated files target_link_libraries(tilemaker diff --git a/Makefile b/Makefile index 45b7c8af..1ac184f1 100644 --- a/Makefile +++ b/Makefile @@ -93,7 +93,6 @@ INC := -I$(PLATFORM_PATH)/include -isystem ./include -I./src $(LUA_CFLAGS) all: tilemaker tilemaker: \ - include/osmformat.pb.o \ include/vector_tile.pb.o \ src/attribute_store.o \ src/coordinates_geom.o \ @@ -106,18 +105,23 @@ tilemaker: \ src/mbtiles.o \ src/mmap_allocator.o \ 
src/node_stores.o \ + src/options_parser.o \ src/osm_lua_processing.o \ src/osm_mem_tiles.o \ src/osm_store.o \ src/output_object.o \ - src/pbf_blocks.o \ + src/pbf_processor.o \ + src/pbf_reader.o \ src/pmtiles.o \ - src/read_pbf.o \ + src/pooled_string.o \ src/read_shp.o \ + src/sharded_node_store.o \ + src/sharded_way_store.o \ src/shared_data.o \ src/shp_mem_tiles.o \ src/sorted_node_store.o \ src/sorted_way_store.o \ + src/tag_map.o \ src/tile_data.o \ src/tilemaker.o \ src/tile_worker.o \ @@ -125,7 +129,50 @@ tilemaker: \ src/write_geometry.o $(CXX) $(CXXFLAGS) -o tilemaker $^ $(INC) $(LIB) $(LDFLAGS) -test: test_sorted_way_store +test: \ + test_append_vector \ + test_attribute_store \ + test_deque_map \ + test_pbf_reader \ + test_pooled_string \ + test_sorted_node_store \ + test_sorted_way_store + +test_append_vector: \ + src/mmap_allocator.o \ + test/append_vector.test.o + $(CXX) $(CXXFLAGS) -o test.append_vector $^ $(INC) $(LIB) $(LDFLAGS) && ./test.append_vector + +test_attribute_store: \ + src/mmap_allocator.o \ + src/attribute_store.o \ + src/pooled_string.o \ + test/attribute_store.test.o + $(CXX) $(CXXFLAGS) -o test.attribute_store $^ $(INC) $(LIB) $(LDFLAGS) && ./test.attribute_store + +test_deque_map: \ + test/deque_map.test.o + $(CXX) $(CXXFLAGS) -o test.deque_map $^ $(INC) $(LIB) $(LDFLAGS) && ./test.deque_map + +test_options_parser: \ + src/options_parser.o \ + test/options_parser.test.o + $(CXX) $(CXXFLAGS) -o test.options_parser $^ $(INC) $(LIB) $(LDFLAGS) && ./test.options_parser + +test_pooled_string: \ + src/mmap_allocator.o \ + src/pooled_string.o \ + test/pooled_string.test.o + $(CXX) $(CXXFLAGS) -o test.pooled_string $^ $(INC) $(LIB) $(LDFLAGS) && ./test.pooled_string + +test_sorted_node_store: \ + src/external/streamvbyte_decode.o \ + src/external/streamvbyte_encode.o \ + src/external/streamvbyte_zigzag.o \ + src/mmap_allocator.o \ + src/sorted_node_store.o \ + test/sorted_node_store.test.o + $(CXX) $(CXXFLAGS) -o test.sorted_node_store 
$^ $(INC) $(LIB) $(LDFLAGS) && ./test.sorted_node_store test_sorted_way_store: \ src/external/streamvbyte_decode.o \ @@ -133,9 +180,14 @@ test_sorted_way_store: \ src/external/streamvbyte_zigzag.o \ src/mmap_allocator.o \ src/sorted_way_store.o \ - src/sorted_way_store.test.o + test/sorted_way_store.test.o $(CXX) $(CXXFLAGS) -o test.sorted_way_store $^ $(INC) $(LIB) $(LDFLAGS) && ./test.sorted_way_store +test_pbf_reader: \ + src/helpers.o \ + src/pbf_reader.o \ + test/pbf_reader.test.o + $(CXX) $(CXXFLAGS) -o test.pbf_reader $^ $(INC) $(LIB) $(LDFLAGS) && ./test.pbf_reader %.o: %.cpp $(CXX) $(CXXFLAGS) -o $@ -c $< $(INC) @@ -153,6 +205,6 @@ install: install docs/man/tilemaker.1 ${DESTDIR}${MANPREFIX}/man1/ clean: - rm -f tilemaker src/*.o src/external/*.o include/*.o include/*.pb.h + rm -f tilemaker src/*.o src/external/*.o include/*.o include/*.pb.h test/*.o .PHONY: install diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index d41fba9b..d605d153 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -107,13 +107,16 @@ For example: ### Lua processing -Your Lua file needs to supply 5 things: +Your Lua file needs to supply a few things: 1. `node_keys`, a list of those OSM keys which indicate that a node should be processed -2. `init_function(name)` (optional), a function to initialize Lua logic -2. `node_function(node)`, a function to process an OSM node and add it to layers -3. `way_function(way)`, a function to process an OSM way and add it to layers -3. `exit_function` (optional), a function to finalize Lua logic (useful to show statistics) +2. `node_function()`, a function to process an OSM node and add it to layers +3. `way_function()`, a function to process an OSM way and add it to layers +4. (optional) `init_function(name)`, a function to initialize Lua logic +5. (optional) `exit_function`, a function to finalize Lua logic (useful to show statistics) +6. 
(optional) `relation_scan_function`, a function to determine whether your Lua file wishes to process the given relation +7. (optional) `relation_function`, a function to process an OSM relation and add it to layers +8. (optional) `attribute_function`, a function to remap attributes from shapefiles `node_keys` is a simple list (or in Lua parlance, a 'table') of OSM tag keys. If a node has one of those keys, it will be processed by `node_function`; if not, it'll be skipped. For example, if you wanted to show highway crossings and railway stations, it should be `{ "highway", "railway" }`. (This avoids the need to process the vast majority of nodes which contain no important tags at all.) @@ -127,28 +130,30 @@ Note the order: you write to a layer first, then set attributes after. To do that, you use these methods: -* `node:Find(key)` or `way:Find(key)`: get the value for a tag, or the empty string if not present. For example, `way:Find("railway")` might return "rail" for a railway, "siding" for a siding, or "" if it isn't a railway at all. -* `node:Holds(key)` or `way:Holds(key)`: returns true if that key exists, false otherwise. -* `node:Layer("layer_name", false)` or `way:Layer("layer_name", is_area)`: write this node/way to the named layer. This is how you put objects in your vector tile. is_area (true/false) specifies whether a way should be treated as an area, or just as a linestring. -* `way:LayerAsCentroid("layer_name")`: write a single centroid point for this way to the named layer (useful for labels and POIs). -* `node:Attribute(key,value,minzoom)` or `node:Attribute(key,value,minzoom)`: add an attribute to the most recently written layer. Argument `minzoom` is optional, use it if you do not want to write the attribute on lower zoom levels. -* `node:AttributeNumeric(key,value,minzoom)`, `node:AttributeBoolean(key,value,minzoom)` (and `way:`...): for numeric/boolean columns. -* `node:Id()` or `way:Id()`: get the OSM ID of the current object. 
-* `node:ZOrder(number)` or `way:ZOrder(number)`: Set a numeric value (default 0, 1-byte signed integer) used to sort features within a layer. Use this feature to ensure a proper rendering order if the rendering engine itself does not support sorting. Sorting is not supported across layers merged with `write_to`. Features with different z-order are not merged if `combine_below` or `combine_polygons_below` is used. -* `node:MinZoom(zoom)` or `way:MinZoom(zoom)`: set the minimum zoom level (0-15) at which this object will be written. Note that the JSON layer configuration minimum still applies (so `:MinZoom(5)` will have no effect if your layer only starts at z6). -* `way:Length()` and `way:Area()`: return the length (metres)/area (square metres) of the current object. Requires recent Boost. -* `way:Centroid()`: return the lat/lon of the centre of the current object as a two-element Lua table (element 1 is lat, 2 is lon). +* `Find(key)`: get the value for a tag, or the empty string if not present. For example, `Find("railway")` might return "rail" for a railway, "siding" for a siding, or "" if it isn't a railway at all. +* `Holds(key)`: returns true if that key exists, false otherwise. +* `Layer("layer_name", is_area)`: write this node/way to the named layer. This is how you put objects in your vector tile. is_area (true/false) specifies whether a way should be treated as an area, or just as a linestring. +* `LayerAsCentroid("layer_name")`: write a single centroid point for this way to the named layer (useful for labels and POIs). +* `Attribute(key,value,minzoom)`: add an attribute to the most recently written layer. Argument `minzoom` is optional, use it if you do not want to write the attribute on lower zoom levels. +* `AttributeNumeric(key,value,minzoom)`, `AttributeBoolean(key,value,minzoom)`: for numeric/boolean columns. +* `Id()`: get the OSM ID of the current object. 
+* `ZOrder(number)`: Set a numeric value (default 0, 1-byte signed integer) used to sort features within a layer. Use this feature to ensure a proper rendering order if the rendering engine itself does not support sorting. Sorting is not supported across layers merged with `write_to`. Features with different z-order are not merged if `combine_below` or `combine_polygons_below` is used. +* `MinZoom(zoom)`: set the minimum zoom level (0-15) at which this object will be written. Note that the JSON layer configuration minimum still applies (so `:MinZoom(5)` will have no effect if your layer only starts at z6). +* `Length()` and `Area()`: return the length (metres)/area (square metres) of the current object. Requires recent Boost. +* `Centroid()`: return the lat/lon of the centre of the current object as a two-element Lua table (element 1 is lat, 2 is lon). The simplest possible function, to include roads/paths and nothing else, might look like this: - function way_function(way) - local highway = way:Find("highway") +```lua + function way_function() + local highway = Find("highway") if highway~="" then - way:Layer("roads", false) - way:Attribute("name", way:Find("name")) - way:Attribute("type", highway) + Layer("roads", false) + Attribute("name", Find("name")) + Attribute("type", highway) end end +``` Take a look at the supplied process.lua for a simple example, or the more complex OpenMapTiles-compatible script in `resources/`. You can specify another filename with the `--process` option. 
@@ -197,11 +202,11 @@ When processing OSM objects with your Lua script, you can perform simple spatial You can then find out whether a node is within one of these polygons using the `Intersects` method: - if node:Intersects("countries") then print("Looks like it's on land"); end + if Intersects("countries") then print("Looks like it's on land"); end Or you can find out what country(/ies) the node is within using `FindIntersecting`, which returns a table: - names = node:FindIntersecting("countries") + names = FindIntersecting("countries") print(table.concat(name,",")) To enable these functions, set `index` to true in your shapefile layer definition. `index_column` is not needed for `Intersects` but required for `FindIntersecting`. diff --git a/docs/RELATIONS.md b/docs/RELATIONS.md index 6e436b68..6fc3b557 100644 --- a/docs/RELATIONS.md +++ b/docs/RELATIONS.md @@ -22,26 +22,30 @@ This is a two-stage process: first, when reading relations, indicate that these To define which relations should be accepted, add a `relation_scan_function`: - function relation_scan_function(relation) - if relation:Find("type")=="route" and relation:Find("route")=="bicycle" then - local network = relation:Find("network") - if network=="ncn" then relation:Accept() end +```lua + function relation_scan_function() + if Find("type")=="route" and Find("route")=="bicycle" then + local network = Find("network") + if network=="ncn" then Accept() end end end +``` -This function takes the relation as its sole argument. Examine the tags using `relation:Find(key)` as normal. (You can also use `relation:Holds(key)` and `relation:Id()`.) If you want to use this relation, call `relation:Accept()`. +Examine the tags using `Find(key)` as normal. (You can also use `Holds(key)` and `Id()`.) If you want to use this relation, call `Accept()`. #### Stage 2: accessing relations from ways -Now that you've accepted the relations, they will be available from `way_function`. 
They are accessed using an iterator (`way:NextRelation()`) which reads each relation for that way in turn, returning nil when there are no more relations available. Once you have accessed a relation with the iterator, you can read its tags with `way:FindInRelation(key)`. For example: +Now that you've accepted the relations, they will be available from `way_function`. They are accessed using an iterator (`NextRelation()`) which reads each relation for that way in turn, returning nil when there are no more relations available. Once you have accessed a relation with the iterator, you can read its tags with `FindInRelation(key)`. For example: +```lua while true do - local rel = way:NextRelation() + local rel = NextRelation() if not rel then break end - print ("Part of route "..way:FindInRelation("ref")) + print ("Part of route "..FindInRelation("ref")) end +``` -(Should you need to re-read the relations, you can reset the iterator with `way:RestartRelations()`.) +(Should you need to re-read the relations, you can reset the iterator with `RestartRelations()`.) 
### Writing relation geometries @@ -52,13 +56,15 @@ First, make sure that you have accepted the relations using `relation_scan_funct Then write a `relation_function`, which works in the same way as `way_function` would: - function relation_function(relation) - if relation:Find("type")=="route" and relation:Find("route")=="bicycle" then - relation:Layer("bike_routes", false) - relation:Attribute("class", relation:Find("network")) - relation:Attribute("ref", relation:Find("ref")) +```lua + function relation_function() + if Find("type")=="route" and Find("route")=="bicycle" then + Layer("bike_routes", false) + Attribute("class", Find("network")) + Attribute("ref", Find("ref")) end end +``` ### Not supported diff --git a/include/append_vector.h b/include/append_vector.h new file mode 100644 index 00000000..07531217 --- /dev/null +++ b/include/append_vector.h @@ -0,0 +1,195 @@ +#ifndef _APPEND_VECTOR_H +#define _APPEND_VECTOR_H + +#include "mmap_allocator.h" +#include <vector> +#include <deque> + +// Tilemaker collects OutputObjects in a list that +// - spills to disk +// - only gets appended to +// +// Vector is great for linear access, but resizes cause expensive disk I/O to +// copy elements. +// +// Deque is great for growing without disk I/O, but it allocates in blocks of 512, +// which is inefficient for linear access. +// +// Instead, we author a limited vector-of-vectors class that allocates in bigger chunks, +// to get the best of both worlds.
+ +#define APPEND_VECTOR_SIZE 8192 +namespace AppendVectorNS { + template <class T> + class AppendVector { + public: + struct Iterator { + using iterator_category = std::random_access_iterator_tag; + using difference_type = std::ptrdiff_t; + using value_type = T; + using pointer = T*; + using reference = T&; + + Iterator(AppendVector& appendVector, uint16_t vec, uint16_t offset): + appendVector(&appendVector), vec(vec), offset(offset) {} + + Iterator(): + appendVector(nullptr), vec(0), offset(0) {} + + + bool operator<(const Iterator& other) const { + if (vec < other.vec) + return true; + + if (vec > other.vec) + return false; + + return offset < other.offset; + } + + bool operator>=(const Iterator& other) const { + return !(*this < other); + } + + Iterator operator-(int delta) const { + int64_t absolute = vec * APPEND_VECTOR_SIZE + offset; + absolute -= delta; + return Iterator(*appendVector, absolute / APPEND_VECTOR_SIZE, absolute % APPEND_VECTOR_SIZE); + } + + Iterator operator+(int delta) const { + int64_t absolute = vec * APPEND_VECTOR_SIZE + offset; + absolute += delta; + return Iterator(*appendVector, absolute / APPEND_VECTOR_SIZE, absolute % APPEND_VECTOR_SIZE); + } + + bool operator==(const Iterator& other) const { + return appendVector == other.appendVector && vec == other.vec && offset == other.offset; + } + + bool operator!=(const Iterator& other) const { + return !(*this == other); + } + + std::ptrdiff_t operator-(const Iterator& other) const { + int64_t absolute = vec * APPEND_VECTOR_SIZE + offset; + int64_t otherAbsolute = other.vec * APPEND_VECTOR_SIZE + other.offset; + + return absolute - otherAbsolute; + } + + reference operator*() const { + auto& vector = appendVector->vecs[vec]; + auto& el = vector[offset]; + return el; + } + + pointer operator->() const { + auto& vector = appendVector->vecs[vec]; + auto& el = vector[offset]; + return &el; + } + + Iterator& operator+= (int delta) { + int64_t absolute = vec * APPEND_VECTOR_SIZE + offset; + absolute += delta;
+ + vec = absolute / APPEND_VECTOR_SIZE; + offset = absolute % APPEND_VECTOR_SIZE; + return *this; + } + + Iterator& operator-= (int delta) { + int64_t absolute = vec * APPEND_VECTOR_SIZE + offset; + absolute -= delta; + + vec = absolute / APPEND_VECTOR_SIZE; + offset = absolute % APPEND_VECTOR_SIZE; + return *this; + } + + // Prefix increment + Iterator& operator++() { + offset++; + if (offset == APPEND_VECTOR_SIZE) { + offset = 0; + vec++; + } + return *this; + } + + // Postfix increment + Iterator operator++(int) { Iterator tmp = *this; ++(*this); return tmp; } + + // Prefix decrement + Iterator& operator--() { + if (offset > 0) { + offset--; + } else { + vec--; + offset = APPEND_VECTOR_SIZE - 1; + } + + return *this; + } + + // Postfix decrement + Iterator operator--(int) { Iterator tmp = *this; --(*this); return tmp; } + + private: + mutable AppendVector* appendVector; + int32_t vec, offset; + }; + + AppendVector(): + count(0), + vecs(1) { + } + + void clear() { + count = 0; + vecs.clear(); + vecs.push_back(std::vector<T, mmap_allocator<T>>()); + vecs.back().reserve(APPEND_VECTOR_SIZE); + } + + size_t size() const { + return count; + } + + T& operator [](int idx) { + auto& vec = vecs[idx / APPEND_VECTOR_SIZE]; + auto& el = vec[idx % APPEND_VECTOR_SIZE]; + return el; + } + + Iterator begin() { + return Iterator(*this, 0, 0); + } + + Iterator end() { + return Iterator(*this, vecs.size() - 1, count % APPEND_VECTOR_SIZE); + } + + void push_back(const T& el) { + if (vecs.back().capacity() == 0) + vecs.back().reserve(APPEND_VECTOR_SIZE); + + vecs.back().push_back(el); + + if (vecs.back().size() == vecs.back().capacity()) { + vecs.push_back(std::vector<T, mmap_allocator<T>>()); + vecs.back().reserve(APPEND_VECTOR_SIZE); + } + + count++; + } + + size_t count; + std::deque<std::vector<T, mmap_allocator<T>>> vecs; + }; +} + +#undef APPEND_VECTOR_SIZE + +#endif diff --git a/include/attribute_store.h b/include/attribute_store.h index ad1aa4e1..6f11ba00 100644 --- a/include/attribute_store.h +++ b/include/attribute_store.h @@ -10,6 +10,8 @@
#include #include #include +#include "pooled_string.h" +#include "deque_map.h" /* AttributeStore - global dictionary for attributes */ @@ -39,26 +41,67 @@ class AttributeKeyStore { std::map keys2index; }; -enum class AttributePairType: char { False = 0, True = 1, Float = 2, String = 3 }; +enum class AttributePairType: char { Bool = 0, Float = 1, String = 2 }; // AttributePair is a key/value pair (with minzoom) +#pragma pack(push, 1) struct AttributePair { - std::string stringValue_; - float floatValue_; - short keyIndex; - char minzoom; - AttributePairType valueType; + short keyIndex : 9; + AttributePairType valueType : 3; + char minzoom : 4; + union { + float floatValue_; + PooledString stringValue_; + }; AttributePair(uint32_t keyIndex, bool value, char minzoom) - : keyIndex(keyIndex), valueType(value ? AttributePairType::True : AttributePairType::False), minzoom(minzoom) + : keyIndex(keyIndex), valueType(AttributePairType::Bool), minzoom(minzoom), floatValue_(value ? 1 : 0) { } - AttributePair(uint32_t keyIndex, const std::string& value, char minzoom) + AttributePair(uint32_t keyIndex, const PooledString& value, char minzoom) : keyIndex(keyIndex), valueType(AttributePairType::String), stringValue_(value), minzoom(minzoom) { } AttributePair(uint32_t keyIndex, float value, char minzoom) - : keyIndex(keyIndex), valueType(AttributePairType::Float), floatValue_(value), minzoom(minzoom) + : keyIndex(keyIndex), valueType(AttributePairType::Float), minzoom(minzoom), floatValue_(value) + { + } + + AttributePair(const AttributePair& other): + keyIndex(other.keyIndex), valueType(other.valueType), minzoom(other.minzoom) { + if (valueType == AttributePairType::Bool || valueType == AttributePairType::Float) { + floatValue_ = other.floatValue_; + return; + } + + stringValue_ = other.stringValue_; + } + + AttributePair& operator=(const AttributePair& other) { + keyIndex = other.keyIndex; + valueType = other.valueType; + minzoom = other.minzoom; + + if (valueType == 
AttributePairType::Bool || valueType == AttributePairType::Float) { + floatValue_ = other.floatValue_; + return *this; + } + + stringValue_ = other.stringValue_; + return *this; + } + + bool operator<(const AttributePair& other) const { + if (minzoom != other.minzoom) + return minzoom < other.minzoom; + if (keyIndex != other.keyIndex) + return keyIndex < other.keyIndex; + if (valueType != other.valueType) return valueType < other.valueType; + + if (hasStringValue()) return pooledString() < other.pooledString(); + if (hasBoolValue()) return boolValue() < other.boolValue(); + if (hasFloatValue()) return floatValue() < other.floatValue(); + throw std::runtime_error("Invalid type in attribute store"); } bool operator==(const AttributePair &other) const { @@ -66,7 +109,7 @@ struct AttributePair { if (valueType == AttributePairType::String) return stringValue_ == other.stringValue_; - if (valueType == AttributePairType::Float) + if (valueType == AttributePairType::Float || valueType == AttributePairType::Bool) return floatValue_ == other.floatValue_; return true; @@ -74,13 +117,16 @@ struct AttributePair { bool hasStringValue() const { return valueType == AttributePairType::String; } bool hasFloatValue() const { return valueType == AttributePairType::Float; } - bool hasBoolValue() const { return valueType == AttributePairType::True || valueType == AttributePairType::False; }; + bool hasBoolValue() const { return valueType == AttributePairType::Bool; } - const std::string& stringValue() const { return stringValue_; } + const PooledString& pooledString() const { return stringValue_; } + const std::string stringValue() const { return stringValue_.toString(); } float floatValue() const { return floatValue_; } - bool boolValue() const { return valueType == AttributePairType::True; } + bool boolValue() const { return floatValue_; } - static bool isHot(const AttributePair& pair, const std::string& keyName) { + void ensureStringIsOwned(); + + static bool isHot(const std::string& 
keyName, const std::string& value) { // Is this pair a candidate for the hot pool? // Hot pairs are pairs that we think are likely to be re-used, like @@ -89,25 +135,11 @@ struct AttributePair { // The trick is that we commit to putting them in the hot pool // before we know if we were right. - // All boolean pairs are eligible. - if (pair.hasBoolValue()) - return true; - - // Small integers are eligible. - if (pair.hasFloatValue()) { - float v = pair.floatValue(); - - if (ceil(v) == v && v >= 0 && v <= 25) - return true; - } - - // The remaining things should be strings, but just in case... - if (!pair.hasStringValue()) - return false; + // The rules for floats/booleans are managed in their addAttribute call. // Only strings that are IDish are eligible: only lowercase letters. bool ok = true; - for (const auto& c: pair.stringValue()) { + for (const auto& c: value) { if (c != '-' && c != '_' && (c < 'a' || c > 'z')) return false; } @@ -124,9 +156,10 @@ struct AttributePair { boost::hash_combine(rv, keyIndex); boost::hash_combine(rv, valueType); - if(hasStringValue()) - boost::hash_combine(rv, stringValue()); - else if(hasFloatValue()) + if(hasStringValue()) { + const char* data = pooledString().data(); + boost::hash_range(rv, data, data + pooledString().size()); + } else if(hasFloatValue()) boost::hash_combine(rv, floatValue()); else if(hasBoolValue()) boost::hash_combine(rv, boolValue()); @@ -137,6 +170,7 @@ struct AttributePair { return rv; } }; +#pragma pack(pop) // We shard the cold pools to reduce the odds of lock contention on @@ -149,46 +183,32 @@ struct AttributePair { #define SHARD_BITS 14 #define ATTRIBUTE_SHARDS (1 << SHARD_BITS) +class AttributeStore; class AttributePairStore { public: AttributePairStore(): finalized(false), - pairs(ATTRIBUTE_SHARDS), - pairsMaps(ATTRIBUTE_SHARDS), pairsMutex(ATTRIBUTE_SHARDS), - hotShardSize(0) + lookups(0), + lookupsUncached(0) { - // NB: the hot shard is stored in its own, pre-allocated vector. 
- // pairs[0] is _not_ the hot shard - hotShard.reserve(1 << 16); - for (size_t i = 0; i < 1 << 16; i++) - hotShard.push_back(AttributePair(0, false, 0)); + // The "hot" shard has a capacity of 64K, the others are unbounded. + pairs.push_back(DequeMap(1 << 16)); + // Reserve offset 0 as a sentinel + pairs[0].add(AttributePair(0, false, 0)); + for (size_t i = 1; i < ATTRIBUTE_SHARDS; i++) + pairs.push_back(DequeMap()); } void finalize() { finalized = true; } const AttributePair& getPair(uint32_t i) const; const AttributePair& getPairUnsafe(uint32_t i) const; - uint32_t addPair(const AttributePair& pair, bool isHot); - - struct key_value_less_ptr { - bool operator()(AttributePair const* lhs, AttributePair const* rhs) const { - if (lhs->minzoom != rhs->minzoom) - return lhs->minzoom < rhs->minzoom; - if (lhs->keyIndex != rhs->keyIndex) - return lhs->keyIndex < rhs->keyIndex; - if (lhs->valueType != rhs->valueType) return lhs->valueType < rhs->valueType; - - if (lhs->hasStringValue()) return lhs->stringValue() < rhs->stringValue(); - if (lhs->hasBoolValue()) return lhs->boolValue() < rhs->boolValue(); - if (lhs->hasFloatValue()) return lhs->floatValue() < rhs->floatValue(); - throw std::runtime_error("Invalid type in attribute store"); - } - }; + uint32_t addPair(AttributePair& pair, bool isHot); - std::vector> pairs; - std::vector> pairsMaps; private: + friend class AttributeStore; + std::vector> pairs; bool finalized; // We refer to all attribute pairs by index. // @@ -198,41 +218,39 @@ class AttributePairStore { // we suspect will be popular. It only ever has 64KB items, // so that we can reference it with a short. 
mutable std::vector pairsMutex; - std::atomic hotShardSize; - std::vector hotShard; + std::atomic lookupsUncached; + std::atomic lookups; }; // AttributeSet is a set of AttributePairs // = the complete attributes for one object struct AttributeSet { - struct less_ptr { - bool operator()(const AttributeSet* lhs, const AttributeSet* rhs) const { - if (lhs->useVector != rhs->useVector) - return lhs->useVector < rhs->useVector; - - if (lhs->useVector) { - if (lhs->intValues.size() != rhs->intValues.size()) - return lhs->intValues.size() < rhs->intValues.size(); - - for (int i = 0; i < lhs->intValues.size(); i++) { - if (lhs->intValues[i] != rhs->intValues[i]) { - return lhs->intValues[i] < rhs->intValues[i]; - } - } + bool operator<(const AttributeSet& other) const { + if (useVector != other.useVector) + return useVector < other.useVector; - return false; - } + if (useVector) { + if (intValues.size() != other.intValues.size()) + return intValues.size() < other.intValues.size(); - for (int i = 0; i < sizeof(lhs->shortValues)/sizeof(lhs->shortValues[0]); i++) { - if (lhs->shortValues[i] != rhs->shortValues[i]) { - return lhs->shortValues[i] < rhs->shortValues[i]; + for (int i = 0; i < intValues.size(); i++) { + if (intValues[i] != other.intValues[i]) { + return intValues[i] < other.intValues[i]; } } return false; } - }; + + for (int i = 0; i < sizeof(shortValues)/sizeof(shortValues[0]); i++) { + if (shortValues[i] != other.shortValues[i]) { + return shortValues[i] < other.shortValues[i]; + } + } + + return false; + } size_t hash() const { // Values are in canonical form after finalizeSet is called, so @@ -253,6 +271,7 @@ struct AttributeSet { return idx; } + bool operator!=(const AttributeSet& other) const { return !(*this == other); } bool operator==(const AttributeSet &other) const { // Equivalent if, for every value in values, there is a value in other.values // whose pair is the same. 
@@ -380,6 +399,8 @@ struct AttributeSet { struct AttributeStore { AttributeIndex add(AttributeSet &attributes); std::vector getUnsafe(AttributeIndex index) const; + void reset(); // used for testing + size_t size() const; void reportSize() const; void finalize(); @@ -390,9 +411,9 @@ struct AttributeStore { AttributeStore(): finalized(false), sets(ATTRIBUTE_SHARDS), - setsMaps(ATTRIBUTE_SHARDS), setsMutex(ATTRIBUTE_SHARDS), - lookups(0) { + lookups(0), + lookupsUncached(0) { } AttributeKeyStore keyStore; @@ -400,11 +421,11 @@ struct AttributeStore { private: bool finalized; - std::vector> sets; - std::vector> setsMaps; + std::vector> sets; mutable std::vector setsMutex; mutable std::mutex mutex; + std::atomic lookupsUncached; std::atomic lookups; }; diff --git a/include/deque_map.h b/include/deque_map.h new file mode 100644 index 00000000..ea57f669 --- /dev/null +++ b/include/deque_map.h @@ -0,0 +1,132 @@ +#ifndef DEQUE_MAP_H +#define DEQUE_MAP_H + +#include +#include +#include +#include +#include + +// A class which looks deep within the soul of some instance of +// a class T and assigns it a number based on the order in which +// it joined (or reminds it of its number). +// +// Used to translate an 8-byte pointer into a 4-byte ID that can be +// used repeatedly. +template +class DequeMap { +public: + DequeMap(): maxSize(0) {} + DequeMap(uint32_t maxSize): maxSize(maxSize) {} + + bool full() const { + return maxSize != 0 && size() == maxSize; + } + + // If `entry` is already in the map, return its index. + // Otherwise, if maxSize is `0`, or greater than the number of entries in the map, + // add the item and return its index. + // Otherwise, return -1. + int32_t add(const T& entry) { + // Search to see if we've already got this entry. 
+ const auto offsets = boost::irange(0, keys.size()); + const auto it = std::lower_bound( + offsets.begin(), + offsets.end(), + entry, + [&](const auto &e, auto id) { + return objects.at(keys[e]) < id; + } + ); + + // We do, return its index. + if (it != offsets.end() && objects[keys[*it]] == entry) + return keys[*it]; + + if (maxSize > 0 && objects.size() >= maxSize) + return -1; + + // We don't, so store it... + const uint32_t newIndex = objects.size(); + objects.push_back(entry); + + // ...and add its index to our keys vector. + const uint32_t keysOffset = it == offsets.end() ? offsets.size() : *it; + + const uint32_t desiredSize = keys.size() + 1; + + // Amortize growth + if (keys.capacity() < desiredSize) + keys.reserve(keys.capacity() * 1.5); + + keys.resize(desiredSize); + + // Unless we're adding to the end, we need to shuffle existing keys down + // to make room for our new index. + if (keysOffset != newIndex) { + std::memmove(&keys[keysOffset + 1], &keys[keysOffset], sizeof(uint32_t) * (keys.size() - 1 - keysOffset)); + } + + keys[keysOffset] = newIndex; + return newIndex; + } + + void clear() { + objects.clear(); + keys.clear(); + } + + // Returns the index of `entry` if present, -1 otherwise. + int32_t find(const T& entry) const { + const auto offsets = boost::irange(0, keys.size()); + const auto it = std::lower_bound( + offsets.begin(), + offsets.end(), + entry, + [&](const auto &e, auto id) { + return objects.at(keys[e]) < id; + } + ); + + // We do, return its index. 
+ if (it != offsets.end() && objects[keys[*it]] == entry) + return keys[*it]; + + return -1; + } + + inline const T& operator[](uint32_t index) const { + return objects[index]; + } + + inline const T& at(uint32_t index) const { + return objects.at(index); + } + + size_t size() const { return objects.size(); } + + struct iterator { + const DequeMap& dm; + size_t offset; + iterator(const DequeMap& dm, size_t offset): dm(dm), offset(offset) {} + void operator++() { offset++; } + bool operator!=(iterator& other) { return offset != other.offset; } + const T& operator*() const { return dm.objects[dm.keys[offset]]; } + }; + + iterator begin() const { return iterator{*this, 0}; } + iterator end() const { return iterator{*this, keys.size()}; } + +private: + uint32_t maxSize; + + // Using a deque is necessary, as it provides pointer-stability for previously + // added objects when it grows the storage (as opposed to, e.g., vector). + std::deque objects; + + // Whereas `objects` is ordered by insertion-time, `keys` is sorted such that + // objects[key[0]] < objects[key[1]] < ... < objects[key[$]] + // operator< of T. 
+ std::vector keys; +}; +#endif diff --git a/include/helpers.h b/include/helpers.h index 7cb9c027..de490874 100644 --- a/include/helpers.h +++ b/include/helpers.h @@ -3,7 +3,8 @@ #define _HELPERS_H #include -#include "geom.h" +#include +#include // General helper routines @@ -27,12 +28,11 @@ inline std::vector split_string(std::string &inputStr, char sep) { return res; } +void decompress_string(std::string& output, const char* input, uint32_t inputSize, bool asGzip = false); double bboxElementFromStr(const std::string& number); std::vector parseBox(const std::string& bbox); -std::string decompress_string(const std::string& str, bool asGzip = false); - std::string compress_string(const std::string& str, int compressionlevel = Z_DEFAULT_COMPRESSION, bool asGzip = false); diff --git a/include/node_store.h b/include/node_store.h index cc84aba2..76fe18b3 100644 --- a/include/node_store.h +++ b/include/node_store.h @@ -23,6 +23,11 @@ class NodeStore // Accessors virtual size_t size() const = 0; virtual LatpLon at(NodeID i) const = 0; + + virtual bool contains(size_t shard, NodeID id) const = 0; + virtual NodeStore& shard(size_t shard) = 0; + virtual const NodeStore& shard(size_t shard) const = 0; + virtual size_t shards() const = 0; }; #endif diff --git a/include/node_stores.h b/include/node_stores.h index c5151bec..05d00f4e 100644 --- a/include/node_stores.h +++ b/include/node_stores.h @@ -5,6 +5,7 @@ #include #include "node_store.h" #include "sorted_node_store.h" +#include "sharded_node_store.h" #include "mmap_allocator.h" class BinarySearchNodeStore : public NodeStore @@ -19,10 +20,16 @@ class BinarySearchNodeStore : public NodeStore LatpLon at(NodeID i) const override; size_t size() const override; void insert(const std::vector& elements) override; - void clear() { + void clear() override { reopen(); } - void batchStart() {} + void batchStart() override {} + + bool contains(size_t shard, NodeID id) const override; + NodeStore& shard(size_t shard) override { return 
*this; } + const NodeStore& shard(size_t shard) const override { return *this; } + size_t shards() const override { return 1; } + private: mutable std::mutex mutex; @@ -49,7 +56,14 @@ class CompactNodeStore : public NodeStore void insert(const std::vector& elements) override; void clear() override; void finalize(size_t numThreads) override {} - void batchStart() {} + void batchStart() override {} + + // CompactNodeStore has no metadata to know whether or not it contains + // a node, so it's not suitable for used in sharded scenarios. + bool contains(size_t shard, NodeID id) const override { return true; } + NodeStore& shard(size_t shard) override { return *this; } + const NodeStore& shard(size_t shard) const override { return *this; } + size_t shards() const override { return 1; } private: // @brief Insert a latp/lon pair. diff --git a/include/options_parser.h b/include/options_parser.h new file mode 100644 index 00000000..3ca73785 --- /dev/null +++ b/include/options_parser.h @@ -0,0 +1,58 @@ +#ifndef OPTIONS_PARSER_H +#define OPTIONS_PARSER_H + +#include +#include +#include + +namespace OptionsParser { + struct OptionException : std::exception { + OptionException(std::string message): message(message) {} + + /// Returns the explanatory string. + const char* what() const noexcept override { + return message.data(); + } + + private: + std::string message; + }; + + enum class OutputMode: char { File = 0, MBTiles = 1, PMTiles = 2 }; + + struct OsmOptions { + std::string storeFile; + bool fast = false; + bool compact = false; + bool skipIntegrity = false; + bool uncompressedNodes = false; + bool uncompressedWays = false; + bool materializeGeometries = false; + // lazyGeometries is the inverse of materializeGeometries. It can be passed + // to override an implicit materializeGeometries, as in the non-store case. 
+ bool lazyGeometries = false; + bool shardStores = false; + }; + + struct Options { + std::vector inputFiles; + std::string luaFile; + std::string jsonFile; + uint32_t threadNum = 0; + std::string outputFile; + std::string bbox; + + OsmOptions osm; + bool showHelp = false; + bool verbose = false; + bool mergeSqlite = false; + bool mapsplit = false; + OutputMode outputMode = OutputMode::File; + bool logTileTimings = false; + }; + + Options parse(const int argc, const char* argv[]); + void showHelp(); +}; + +#endif diff --git a/include/osm_lua_processing.h b/include/osm_lua_processing.h index b646bc2e..6a6a1d5d 100644 --- a/include/osm_lua_processing.h +++ b/include/osm_lua_processing.h @@ -13,9 +13,12 @@ #include "shp_mem_tiles.h" #include "osm_mem_tiles.h" #include "helpers.h" +#include #include +class TagMap; + // Lua extern "C" { #include "lua.h" @@ -31,6 +34,20 @@ extern bool verbose; class AttributeStore; class AttributeSet; +// A string, which might be in `currentTags` as a value. If Lua +// code refers to an absent value, it'll fallback to passing +// it as a std::string. +// +// The intent is that Attribute("name", Find("name")) is a common +// pattern, and we ought to avoid marshalling a string back and +// forth from C++ to Lua when possible. +struct PossiblyKnownTagValue { + bool found; + uint32_t index; + std::string fallback; +}; + + /** \brief OsmLuaProcessing - converts OSM objects into OutputObjects. 
@@ -71,34 +88,28 @@ class OsmLuaProcessing { // ---- Data loading methods - using tag_map_t = boost::container::flat_map; + using tag_map_t = boost::container::flat_map; // Scan non-MP relation - bool scanRelation(WayID id, const tag_map_t &tags); + bool scanRelation(WayID id, const TagMap& tags); /// \brief We are now processing a significant node - void setNode(NodeID id, LatpLon node, const tag_map_t &tags); + void setNode(NodeID id, LatpLon node, const TagMap& tags); /// \brief We are now processing a way - bool setWay(WayID wayId, LatpLonVec const &llVec, const tag_map_t &tags); + bool setWay(WayID wayId, LatpLonVec const &llVec, const TagMap& tags); /** \brief We are now processing a relation * (note that we store relations as ways with artificial IDs, and that * we use decrementing positive IDs to give a bit more space for way IDs) */ - void setRelation(int64_t relationId, WayVec const &outerWayVec, WayVec const &innerWayVec, const tag_map_t &tags, bool isNativeMP, bool isInnerOuter); + void setRelation(int64_t relationId, WayVec const &outerWayVec, WayVec const &innerWayVec, const TagMap& tags, bool isNativeMP, bool isInnerOuter); // ---- Metadata queries called from Lua // Get the ID of the current object std::string Id() const; - // Check if there's a value for a given key - bool Holds(const std::string& key) const; - - // Get an OSM tag for a given key (or return empty string if none) - const std::string& Find(const std::string& key) const; - // ---- Spatial queries called from Lua // Find intersecting shapefile layer @@ -160,11 +171,8 @@ class OsmLuaProcessing { void LayerAsCentroid(const std::string &layerName); // Set attributes in a vector tile's Attributes table - void Attribute(const std::string &key, const std::string &val); - void AttributeWithMinZoom(const std::string &key, const std::string &val, const char minzoom); - void AttributeNumeric(const std::string &key, const float val); + void AttributeWithMinZoom(const std::string &key, const 
PossiblyKnownTagValue& val, const char minzoom); void AttributeNumericWithMinZoom(const std::string &key, const float val, const char minzoom); - void AttributeBoolean(const std::string &key, const bool val); void AttributeBooleanWithMinZoom(const std::string &key, const bool val, const char minzoom); void MinZoom(const double z); void ZOrder(const double z); @@ -199,6 +207,7 @@ class OsmLuaProcessing { inline AttributeStore &getAttributeStore() { return attributeStore; } struct luaProcessingException :std::exception {}; + const TagMap* currentTags; private: /// Internal: clear current cached state @@ -216,6 +225,8 @@ class OsmLuaProcessing { lastStoredGeometryId = 0; } + void removeAttributeIfNeeded(const std::string& key); + const inline Point getPoint() { return Point(lon/10000000.0,latp/10000000.0); } @@ -258,7 +269,7 @@ class OsmLuaProcessing { class LayerDefinition &layers; std::vector> outputs; // All output objects that have been created - const boost::container::flat_map* currentTags; + std::vector outputKeys; std::vector finalizeOutputs(); diff --git a/include/osm_mem_tiles.h b/include/osm_mem_tiles.h index a6266ea3..3c920b08 100644 --- a/include/osm_mem_tiles.h +++ b/include/osm_mem_tiles.h @@ -6,10 +6,15 @@ #include "osm_store.h" #include "geometry_cache.h" -#define OSM_THRESHOLD (1ull << 35) -#define USE_WAY_STORE (1ull << 35) -#define IS_WAY(x) (((x) >> 35) == (USE_WAY_STORE >> 35)) -#define OSM_ID(x) ((x) & 0b111111111111111111111111111111111) +// NB: Currently, USE_NODE_STORE and USE_WAY_STORE are equivalent. +// If we permit LayerAsCentroid to be generated from the OSM stores, +// this will have to change. 
+#define OSM_THRESHOLD (1ull << TILE_DATA_ID_SIZE) +#define USE_NODE_STORE (2ull << TILE_DATA_ID_SIZE) +#define IS_NODE(x) (((x) >> TILE_DATA_ID_SIZE) == (USE_NODE_STORE >> TILE_DATA_ID_SIZE)) +#define USE_WAY_STORE (1ull << TILE_DATA_ID_SIZE) +#define IS_WAY(x) (((x) >> TILE_DATA_ID_SIZE) == (USE_WAY_STORE >> TILE_DATA_ID_SIZE)) +#define OSM_ID(x) ((x) & 0b1111111111111111111111111111111111) class NodeStore; class WayStore; @@ -32,18 +37,21 @@ class OsmMemTiles : public TileDataSource { const WayStore& wayStore ); + std::string name() const override { return "osm"; } + Geometry buildWayGeometry( const OutputGeometryType geomType, const NodeID objectID, const TileBbox &bbox ) override; + LatpLon buildNodeGeometry(NodeID const objectID, const TileBbox &bbox) const override; void Clear(); private: - void populateLinestring(Linestring& ls, NodeID objectID); - Linestring& getOrBuildLinestring(NodeID objectID); + void populateLinestring(Linestring& ls, NodeID objectID) const; + Linestring& getOrBuildLinestring(NodeID objectID) const; void populateMultiPolygon(MultiPolygon& dst, NodeID objectID) override; const NodeStore& nodeStore; diff --git a/include/osm_store.h b/include/osm_store.h index 11158bb2..5bb74272 100644 --- a/include/osm_store.h +++ b/include/osm_store.h @@ -11,12 +11,21 @@ #include #include #include +#include extern bool verbose; class NodeStore; class WayStore; +// A comparator for data_view so it can be used in boost's flat_map +struct DataViewLessThan { + bool operator()(const protozero::data_view& a, const protozero::data_view& b) const { + return a < b; + } +}; + + // // Internal data structures. 
// @@ -72,37 +81,39 @@ class RelationScanStore { private: using tag_map_t = boost::container::flat_map; - std::map> relationsForWays; - std::map relationTags; - mutable std::mutex mutex; + std::vector>> relationsForWays; + std::vector> relationTags; + mutable std::vector mutex; public: + RelationScanStore(): relationsForWays(128), relationTags(128), mutex(128) {} void relation_contains_way(WayID relid, WayID wayid) { - std::lock_guard lock(mutex); - relationsForWays[wayid].emplace_back(relid); + const size_t shard = wayid % mutex.size(); + + std::lock_guard lock(mutex[shard]); + relationsForWays[shard][wayid].emplace_back(relid); } void store_relation_tags(WayID relid, const tag_map_t &tags) { - std::lock_guard lock(mutex); - relationTags[relid] = tags; + const size_t shard = relid % mutex.size(); + std::lock_guard lock(mutex[shard]); + relationTags[shard][relid] = tags; } bool way_in_any_relations(WayID wayid) { - return relationsForWays.find(wayid) != relationsForWays.end(); + const size_t shard = wayid % mutex.size(); + return relationsForWays[shard].find(wayid) != relationsForWays[shard].end(); } std::vector relations_for_way(WayID wayid) { - return relationsForWays[wayid]; + const size_t shard = wayid % mutex.size(); + return relationsForWays[shard][wayid]; } std::string get_relation_tag(WayID relid, const std::string &key) { - auto it = relationTags.find(relid); - if (it==relationTags.end()) return ""; + const size_t shard = relid % mutex.size(); + auto it = relationTags[shard].find(relid); + if (it==relationTags[shard].end()) return ""; auto jt = it->second.find(key); if (jt==it->second.end()) return ""; return jt->second; } - void clear() { - std::lock_guard lock(mutex); - relationsForWays.clear(); - relationTags.clear(); - } }; diff --git a/include/osmformat.proto b/include/osmformat.proto deleted file mode 100644 index 93060586..00000000 --- a/include/osmformat.proto +++ /dev/null @@ -1,226 +0,0 @@ -syntax = "proto2"; - -option java_package = 
"crosby.binary"; - -/* OSM Binary file format - -This is the master schema file of the OSM binary file format. This -file is designed to support limited random-access and future -extendability. - -A binary OSM file consists of a sequence of FileBlocks (please see -fileformat.proto). The first fileblock contains a serialized instance -of HeaderBlock, followed by a sequence of PrimitiveBlock blocks that -contain the primitives. - -Each primitiveblock is designed to be independently parsable. It -contains a string table storing all strings in that block (keys and -values in tags, roles in relations, usernames, etc.) as well as -metadata containing the precision of coordinates or timestamps in that -block. - -A primitiveblock contains a sequence of primitive groups, each -containing primitives of the same type (nodes, densenodes, ways, -relations). Coordinates are stored in signed 64-bit integers. Lat&lon -are measured in units nanodegrees. The default of -granularity of 100 nanodegrees corresponds to about 1cm on the ground, -and a full lat or lon fits into 32 bits. - -Converting an integer to a lattitude or longitude uses the formula: -$OUT = IN * granularity / 10**9$. Many encoding schemes use delta -coding when representing nodes and relations. - -*/ - -/* Added */ - -message BlobHeader { - required string type = 1; - optional bytes indexdata = 2; - required int32 datasize = 3; -} -message Blob { - optional bytes raw = 1; // No compression - optional int32 raw_size = 2; // Only set when compressed, to the uncompressed size - optional bytes zlib_data = 3; - // optional bytes lzma_data = 4; // PROPOSED. - // optional bytes OBSOLETE_bzip2_data = 5; // Deprecated. -} - - -////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////////////////////////////////// - -/* Contains the file header. 
*/ - -message HeaderBlock { - optional HeaderBBox bbox = 1; - /* Additional tags to aid in parsing this dataset */ - repeated string required_features = 4; - repeated string optional_features = 5; - - optional string writingprogram = 16; - optional string source = 17; // From the bbox field. -} - - -/** The bounding box field in the OSM header. BBOX, as used in the OSM -header. Units are always in nanodegrees -- they do not obey -granularity rules. */ - -message HeaderBBox { - required sint64 left = 1; - required sint64 right = 2; - required sint64 top = 3; - required sint64 bottom = 4; -} - - -/////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////// - - -message PrimitiveBlock { - required StringTable stringtable = 1; - repeated PrimitiveGroup primitivegroup = 2; - - // Granularity, units of nanodegrees, used to store coordinates in this block - optional int32 granularity = 17 [default=100]; - // Offset value between the output coordinates coordinates and the granularity grid in unites of nanodegrees. - optional int64 lat_offset = 19 [default=0]; - optional int64 lon_offset = 20 [default=0]; - -// Granularity of dates, normally represented in units of milliseconds since the 1970 epoch. - optional int32 date_granularity = 18 [default=1000]; - - - // Proposed extension: - //optional BBox bbox = 19; -} - -// Group of OSMPrimitives. All primitives in a group must be the same type. -message PrimitiveGroup { - repeated Node nodes = 1; - optional DenseNodes dense = 2; - repeated Way ways = 3; - repeated Relation relations = 4; - repeated ChangeSet changesets = 5; -} - - -/** String table, contains the common strings in each block. - - Note that we reserve index '0' as a delimiter, so the entry at that - index in the table is ALWAYS blank and unused. - - */ -message StringTable { - repeated bytes s = 1; -} - -/* Optional metadata that may be included into each primitive. 
*/ -message Info { - optional int32 version = 1 [default = -1]; - optional int32 timestamp = 2; - optional int64 changeset = 3; - optional int32 uid = 4; - optional int32 user_sid = 5; // String IDs -} - -/** Optional metadata that may be included into each primitive. Special dense format used in DenseNodes. */ -message DenseInfo { - repeated int32 version = 1 [packed = true]; - repeated sint64 timestamp = 2 [packed = true]; // DELTA coded - repeated sint64 changeset = 3 [packed = true]; // DELTA coded - repeated sint32 uid = 4 [packed = true]; // DELTA coded - repeated sint32 user_sid = 5 [packed = true]; // String IDs for usernames. DELTA coded -} - - -// TODO: REMOVE THIS? NOT in osmosis schema. -message ChangeSet { - required int64 id = 1; - // Parallel arrays. - repeated uint32 keys = 2 [packed = true]; // String IDs. - repeated uint32 vals = 3 [packed = true]; // String IDs. - - optional Info info = 4; - - required int64 created_at = 8; - optional int64 closetime_delta = 9; - required bool open = 10; - optional HeaderBBox bbox = 11; -} - - -message Node { - required sint64 id = 1; - // Parallel arrays. - repeated uint32 keys = 2 [packed = true]; // String IDs. - repeated uint32 vals = 3 [packed = true]; // String IDs. - - optional Info info = 4; // May be omitted in omitmeta - - required sint64 lat = 8; - required sint64 lon = 9; -} - -/* Used to densly represent a sequence of nodes that do not have any tags. - -We represent these nodes columnwise as five columns: ID's, lats, and -lons, all delta coded. When metadata is not omitted, - -We encode keys & vals for all nodes as a single array of integers -containing key-stringid and val-stringid, using a stringid of 0 as a -delimiter between nodes. 
- - ( ( )* '0' )* - */ - -message DenseNodes { - repeated sint64 id = 1 [packed = true]; // DELTA coded - - //repeated Info info = 4; - optional DenseInfo denseinfo = 5; - - repeated sint64 lat = 8 [packed = true]; // DELTA coded - repeated sint64 lon = 9 [packed = true]; // DELTA coded - - // Special packing of keys and vals into one array. May be empty if all nodes in this block are tagless. - repeated int32 keys_vals = 10 [packed = true]; -} - - -message Way { - required int64 id = 1; - // Parallel arrays. - repeated uint32 keys = 2 [packed = true]; - repeated uint32 vals = 3 [packed = true]; - - optional Info info = 4; - - repeated sint64 refs = 8 [packed = true]; // DELTA coded - repeated sint64 lats = 9 [packed = true]; - repeated sint64 lons = 10 [packed = true]; -} - -message Relation { - enum MemberType { - NODE = 0; - WAY = 1; - RELATION = 2; - } - required int64 id = 1; - - // Parallel arrays. - repeated uint32 keys = 2 [packed = true]; - repeated uint32 vals = 3 [packed = true]; - - optional Info info = 4; - - // Parallel arrays - repeated int32 roles_sid = 8 [packed = true]; - repeated sint64 memids = 9 [packed = true]; // DELTA encoded - repeated MemberType types = 10 [packed = true]; -} - diff --git a/include/output_object.h b/include/output_object.h index 3d2d862e..9afd5cba 100644 --- a/include/output_object.h +++ b/include/output_object.h @@ -12,7 +12,6 @@ #include "osm_store.h" // Protobuf -#include "osmformat.pb.h" #include "vector_tile.pb.h" enum OutputGeometryType : unsigned int { POINT_, LINESTRING_, MULTILINESTRING_, POLYGON_ }; @@ -22,9 +21,6 @@ std::ostream& operator<<(std::ostream& os, OutputGeometryType geomType); /** * \brief OutputObject - any object (node, linestring, polygon) to be outputted to tiles - - * Possible future improvements to save memory: - * - use a global dictionary for attribute key/values */ #pragma pack(push, 4) class OutputObject { diff --git a/include/pbf_blocks.h b/include/pbf_blocks.h deleted file mode 100644 
index 5cc28969..00000000 --- a/include/pbf_blocks.h +++ /dev/null @@ -1,48 +0,0 @@ -/*! \file */ -#ifndef _PBF_BLOCKS_H -#define _PBF_BLOCKS_H - -#include -#include -#include -#include - -// Protobuf -#include "osmformat.pb.h" -#include "vector_tile.pb.h" - -/* ------------------- - Protobuf handling - ------------------- */ - -// Read and parse a protobuf message -void readMessage(google::protobuf::Message *message, std::istream &input, unsigned int size); - -// Read an osm.pbf sequence of header length -> BlobHeader -> Blob -// and parse the unzipped contents into a message -BlobHeader readHeader(std::istream &input); -void readBlock(google::protobuf::Message *messagePtr, std::size_t datasize, std::istream &input); - -void writeBlock(google::protobuf::Message *messagePtr, std::ostream &output, std::string headerType); -/* ------------------- - Tag handling - ------------------- */ - -// Populate an array with the contents of a StringTable -void readStringTable(std::vector *strPtr, PrimitiveBlock *pbPtr); - -/// Populate a map with the reverse contents of a StringTable (i.e. 
string->num) -void readStringMap(std::map *mapPtr, PrimitiveBlock *pbPtr); - -/// Read the tags for a way into a hash -/// requires strings array to have been populated by readStringTable -std::map getTags(std::vector *strPtr, Way *wayPtr); - -/// Find the index of a string in the StringTable, adding it if it's not there -unsigned int findStringInTable(std::string *strPtr, std::map *mapPtr, PrimitiveBlock *pbPtr); - -/// Set a tag for a way to a new value -void setTag(Way *wayPtr, unsigned int keyIndex, unsigned int valueIndex); - -#endif //_PBF_BLOCKS_H - diff --git a/include/read_pbf.h b/include/pbf_processor.h similarity index 56% rename from include/read_pbf.h rename to include/pbf_processor.h index b934a563..691613c1 100644 --- a/include/read_pbf.h +++ b/include/pbf_processor.h @@ -8,10 +8,12 @@ #include #include #include "osm_store.h" +#include "pbf_reader.h" +#include // Protobuf -#include "osmformat.pb.h" #include "vector_tile.pb.h" +#include "tag_map.h" class OsmLuaProcessing; @@ -42,33 +44,34 @@ struct IndexedBlockMetadata: BlockMetadata { * * The output class is typically OsmMemTiles, which is derived from OsmLuaProcessing */ -class PbfReader +class PbfProcessor { public: enum class ReadPhase { Nodes = 1, Ways = 2, Relations = 4, RelationScan = 8 }; - PbfReader(OSMStore &osmStore); + PbfProcessor(OSMStore &osmStore); using pbfreader_generate_output = std::function< std::shared_ptr () >; using pbfreader_generate_stream = std::function< std::shared_ptr () >; int ReadPbfFile( + uint shards, bool hasSortTypeThenID, const std::unordered_set& nodeKeys, unsigned int threadNum, const pbfreader_generate_stream& generate_stream, - const pbfreader_generate_output& generate_output + const pbfreader_generate_output& generate_output, + const NodeStore& nodeStore, + const WayStore& wayStore ); // Read tags into a map from a way/node/relation - using tag_map_t = boost::container::flat_map; template - void readTags(T &pbfObject, PrimitiveBlock const &pb, tag_map_t &tags) 
{ - tags.reserve(pbfObject.keys_size()); - auto keysPtr = pbfObject.mutable_keys(); - auto valsPtr = pbfObject.mutable_vals(); - for (uint n=0; n < pbfObject.keys_size(); n++) { - tags[pb.stringtable().s(keysPtr->Get(n))] = pb.stringtable().s(valsPtr->Get(n)); + void readTags(T &pbfObject, PbfReader::PrimitiveBlock const &pb, TagMap& tags) { + for (uint n=0; n < pbfObject.keys.size(); n++) { + auto keyIndex = pbfObject.keys[n]; + auto valueIndex = pbfObject.vals[n]; + tags.addTag(pb.stringTable[keyIndex], pb.stringTable[valueIndex]); } } @@ -79,29 +82,40 @@ class PbfReader const BlockMetadata& blockMetadata, const std::unordered_set& nodeKeys, bool locationsOnWays, - ReadPhase phase + ReadPhase phase, + uint shard, + uint effectiveShard ); - bool ReadNodes(OsmLuaProcessing &output, PrimitiveGroup &pg, PrimitiveBlock const &pb, const std::unordered_set &nodeKeyPositions); + bool ReadNodes(OsmLuaProcessing& output, PbfReader::PrimitiveGroup& pg, const PbfReader::PrimitiveBlock& pb, const std::unordered_set& nodeKeyPositions); - bool ReadWays(OsmLuaProcessing &output, PrimitiveGroup &pg, PrimitiveBlock const &pb, bool locationsOnWays); - bool ScanRelations(OsmLuaProcessing &output, PrimitiveGroup &pg, PrimitiveBlock const &pb); + bool ReadWays( + OsmLuaProcessing& output, + PbfReader::PrimitiveGroup& pg, + const PbfReader::PrimitiveBlock& pb, + bool locationsOnWays, + uint shard, + uint effectiveShards + ); + bool ScanRelations(OsmLuaProcessing& output, PbfReader::PrimitiveGroup& pg, const PbfReader::PrimitiveBlock& pb); bool ReadRelations( OsmLuaProcessing& output, - PrimitiveGroup& pg, - const PrimitiveBlock& pb, - const BlockMetadata& blockMetadata + PbfReader::PrimitiveGroup& pg, + const PbfReader::PrimitiveBlock& pb, + const BlockMetadata& blockMetadata, + uint shard, + uint effectiveShards ); - inline bool RelationIsType(Relation const &rel, int typeKey, int val) { - if (typeKey==-1 || val==-1) return false; - auto typeI = std::find(rel.keys().begin(), 
rel.keys().end(), typeKey); - if (typeI==rel.keys().end()) return false; - int typePos = typeI - rel.keys().begin(); - return rel.vals().Get(typePos) == val; + inline bool relationIsType(const PbfReader::Relation& rel, int typeKey, int val) { + if (typeKey == -1 || val == -1) return false; + auto typeI = std::find(rel.keys.begin(), rel.keys.end(), typeKey); + if (typeI == rel.keys.end()) return false; + int typePos = typeI - rel.keys.begin(); + return rel.vals[typePos] == val; } /// Find a string in the dictionary - static int findStringPosition(PrimitiveBlock const &pb, char const *str); + static int findStringPosition(const PbfReader::PrimitiveBlock& pb, const std::string& str); OSMStore &osmStore; std::mutex ioMutex; diff --git a/include/pbf_reader.h b/include/pbf_reader.h new file mode 100644 index 00000000..9af930c5 --- /dev/null +++ b/include/pbf_reader.h @@ -0,0 +1,296 @@ +#ifndef _PBF_READER_H +#define _PBF_READER_H + +#include +#include +#include +#include +#include +#include + +namespace PbfReader { + namespace Schema { + // See https://wiki.openstreetmap.org/wiki/PBF_Format#Definition_of_the_OSMHeader_fileblock + // for more background on the PBF schema. + enum class BlobHeader : protozero::pbf_tag_type { + required_string_type = 1, + optional_bytes_indexdata = 2, + required_int32_datasize = 3 + }; + + enum class Blob : protozero::pbf_tag_type { + optional_int32_raw_size = 2, // When compressed, the uncompressed size + oneof_data_bytes_raw = 1, // No compression + oneof_data_bytes_zlib_data = 3, + oneof_data_bytes_lzma_data = 4, + // Formerly used for bzip2 compressed data. Deprecated in 2010. + // bytes OBSOLETE_bzip2_data = 5 [deprecated=true]; // Don't reuse this tag number. + oneof_data_bytes_lz4_data = 6, + oneof_data_bytes_zstd_data = 7, + }; + + enum class HeaderBBox : protozero::pbf_tag_type { + // These units are always in nanodegrees, they don't obey granularity rules. 
+ required_sint64_left = 1, + required_sint64_right = 2, + required_sint64_top = 3, + required_sint64_bottom = 4 + }; + + enum class HeaderBlock : protozero::pbf_tag_type { + optional_HeaderBBox_bbox = 1, + repeated_string_optional_features = 5 + }; + + enum class StringTable : protozero::pbf_tag_type { + repeated_bytes_s = 1 + }; + + enum class PrimitiveBlock : protozero::pbf_tag_type { + required_StringTable_stringtable = 1, + repeated_PrimitiveGroup_primitivegroup = 2, + optional_int32_granularity = 17, + optional_int32_date_granularity = 18, + optional_int64_lat_offset = 19, + optional_int64_lon_offset = 20 + }; + + enum class PrimitiveGroup : protozero::pbf_tag_type { + repeated_Node_nodes = 1, + optional_DenseNodes_dense = 2, + repeated_Way_ways = 3, + repeated_Relation_relations = 4, + repeated_ChangeSet_changesets = 5 + }; + + enum class DenseNodes : protozero::pbf_tag_type { + repeated_sint64_id = 1, + repeated_sint64_lat = 8, + repeated_sint64_lon = 9, + repeated_int32_keys_vals = 10 + }; + + enum class Way : protozero::pbf_tag_type { + required_int64_id = 1, + repeated_uint32_keys = 2, + repeated_uint32_vals = 3, + repeated_sint64_refs = 8, + repeated_sint64_lats = 9, + repeated_sint64_lons = 10 + }; + + enum class Relation : protozero::pbf_tag_type { + required_int64_id = 1, + repeated_uint32_keys = 2, + repeated_uint32_vals = 3, + repeated_int32_roles_sid = 8, + repeated_sint64_memids = 9, + repeated_MemberType_types = 10 + }; + } + + struct BlobHeader { + std::string type; + int32_t datasize; + }; + + struct HeaderBBox { + double minLon, maxLon, minLat, maxLat; + }; + + struct HeaderBlock { + bool hasBbox; + HeaderBBox bbox; + std::set optionalFeatures; + }; + + enum class PrimitiveGroupType: char { Node = 1, DenseNodes = 2, Way = 3, Relation = 4, ChangeSet = 5}; + + struct DenseNodes { + struct Node { + uint64_t id; + int32_t lon; + int32_t lat; + uint32_t tagStart; + uint32_t tagEnd; + }; + + struct Iterator { + int32_t offset; + Node node; + 
DenseNodes& nodes; + + bool operator!=(Iterator& other) const; + void operator++(); + Node& operator*(); + }; + + std::vector ids; + std::vector lons; + std::vector lats; + std::vector tagStart; + std::vector tagEnd; + std::vector keyValues; + Iterator begin(); + Iterator end(); + bool empty(); + void clear(); + void readDenseNodes(protozero::data_view data); + }; + + struct Way { + uint64_t id; + std::vector keys; + std::vector vals; + std::vector refs; + std::vector lats; + std::vector lons; + }; + + struct Relation { + enum MemberType: int { NODE = 0, WAY = 1, RELATION = 2 }; + uint64_t id; + std::vector keys; + std::vector vals; + std::vector memids; + std::vector roles_sid; + std::vector types; + }; + + class PrimitiveGroup; + struct Ways { + struct Iterator { + protozero::pbf_message message; + int offset; + Way& way; + + bool operator!=(Iterator& other) const; + void operator++(); + PbfReader::Way& operator*(); + + private: + void readWay(protozero::data_view data); + }; + + Ways(PrimitiveGroup* pg, Way& way): pg(pg), way(way) {} + Iterator begin(); + Iterator end(); + bool empty(); + + private: + friend PrimitiveGroup; + PrimitiveGroup* pg; + Way& way; + }; + + struct Relations { + struct Iterator { + protozero::pbf_message message; + int offset; + Relation& relation; + + bool operator!=(Iterator& other) const; + void operator++(); + PbfReader::Relation& operator*(); + + private: + void readRelation(protozero::data_view data); + }; + + + Relations(PrimitiveGroup* pg, Relation& relation): pg(pg), relation(relation) {} + Iterator begin(); + Iterator end(); + bool empty(); + + private: + friend PrimitiveGroup; + PrimitiveGroup* pg; + Relation& relation; + }; + + struct PrimitiveGroup { + PrimitiveGroup( + protozero::data_view data, + DenseNodes& nodes, + Way& way, + Relation& relation + ); + DenseNodes& nodes() const; + Ways& ways() const; + Relations& relations() const; + PrimitiveGroupType type() const; + + int32_t translateNodeKeyValue(int32_t i) const; + + 
// Only meant to be called by our iterator, not by client code. + void ensureData(); + protozero::data_view getDataView(); + private: + protozero::data_view data; + DenseNodes& denseNodes; + mutable Ways internalWays; + mutable Relations internalRelations; + PrimitiveGroupType internalType; + bool denseNodesInitialized; + + }; + + class PbfReader; + struct PrimitiveBlock { + struct PrimitiveGroups { + struct Iterator { + int offset; + std::vector* groups; + + Iterator(): offset(0), groups(nullptr) {} + Iterator(int offset, std::vector& groups): offset(offset), groups(&groups) {} + bool operator!=(Iterator& other) const; + void operator++(); + PrimitiveGroup& operator*(); + }; + + + PrimitiveGroups(): groups(nullptr) {} + PrimitiveGroups(std::vector& groups): groups(&groups) {} + Iterator begin(); + Iterator end(); + + private: + std::vector* groups; + }; + + std::vector stringTable; + PrimitiveGroups& groups(); + + private: + friend PbfReader; + std::vector internalGroups; + PrimitiveGroups groupsImpl; + }; + + // This is a little weird: we use a class only to get private storage + // for multiple PBF readers. Due to the way we plumb the input files + // elsewhere in the system, the readers don't own them, and are not + // responsible for closing them. 
+ class PbfReader { + public: + BlobHeader readBlobHeader(std::istream& input); + protozero::data_view readBlob(int32_t datasize, std::istream& input); + HeaderBlock readHeaderBlock(protozero::data_view data); + HeaderBBox readHeaderBBox(protozero::data_view data); + PrimitiveBlock& readPrimitiveBlock(protozero::data_view data); + void readStringTable(protozero::data_view data, std::vector& stringTable); + HeaderBlock readHeaderFromFile(std::istream& input); + + private: + std::string blobStorage; // the blob as stored in the PBF + std::string blobStorage2; // the blob after decompression, if needed + PrimitiveBlock pb; + DenseNodes denseNodes; + Way way; + Relation relation; + }; +} + +#endif diff --git a/include/pooled_string.h b/include/pooled_string.h new file mode 100644 index 00000000..56d44453 --- /dev/null +++ b/include/pooled_string.h @@ -0,0 +1,61 @@ +#ifndef _POOLED_STRING_H +#define _POOLED_STRING_H + +// std::string is quite general: +// - mutable +// - unlimited length +// - capacity can differ from size +// - can deallocate its dynamic memory +// +// Our use case, by contrast is immutable, bounded strings that live for the +// duration of the process. +// +// This gives us some room to have less memory overhead, especially on +// g++, whose implementation of std::string requires 32 bytes. +// +// Thus, we implement `PooledString`. It has a size of 16 bytes, and a small +// string optimization for strings <= 15 bytes. (We will separately teach +// AttributePair to encode Latin-character strings more efficiently, so that many +// strings of size 24 or less fit in 15 bytes.) +// +// If it needs to allocate memory, it does so from a shared pool. It is unable +// to free the memory once allocated. 
+ +// PooledString has one of three modes: +// - [126:127] = 00: small-string, length is in [120:125], lower 15 bytes are string +// - [126:127] = 10: pooled string, table is in bytes 1..3, offset in bytes 4..5, length in bytes 6..7 +// - [126:127] = 11: pointer to std::string, pointer is in bytes 8..15 +// +// Note that the pointer mode is not safe to be stored. It exists just to allow +// lookups in the AttributePair map before deciding to allocate a string. + +#include +#include + +namespace PooledStringNS { + class PooledString { + public: + // Create a short string or heap string, long-lived. + PooledString(const std::string& str); + + + // Create a std string - only valid so long as the string that is + // pointed to is valid. + PooledString(const std::string* str); + size_t size() const; + bool operator<(const PooledString& other) const; + bool operator==(const PooledString& other) const; + bool operator!=(const PooledString& other) const; + std::string toString() const; + const char* data() const; + void ensureStringIsOwned(); + + private: + // 0..3 is index into table, 4..5 is offset, 6..7 is length + uint8_t storage[16]; + }; +} + +using PooledString = PooledStringNS::PooledString; + +#endif diff --git a/include/protozero/basic_pbf_builder.hpp b/include/protozero/basic_pbf_builder.hpp new file mode 100644 index 00000000..0ede726f --- /dev/null +++ b/include/protozero/basic_pbf_builder.hpp @@ -0,0 +1,266 @@ +#ifndef PROTOZERO_BASIC_PBF_BUILDER_HPP +#define PROTOZERO_BASIC_PBF_BUILDER_HPP + +/***************************************************************************** + +protozero - Minimalistic protocol buffer decoder and encoder in C++. + +This file is from https://github.com/mapbox/protozero where you can find more +documentation. + +*****************************************************************************/ + +/** + * @file basic_pbf_builder.hpp + * + * @brief Contains the basic_pbf_builder template class. 
+ */ + +#include "basic_pbf_writer.hpp" +#include "types.hpp" + +#include + +namespace protozero { + +/** + * The basic_pbf_builder is used to write PBF formatted messages into a buffer. + * It is based on the basic_pbf_writer class and has all the same methods. The + * difference is that while the pbf_writer class takes an integer tag, + * this template class takes a tag of the template type T. The idea is that + * T will be an enumeration value and this helps reduce the possibility of + * programming errors. + * + * Almost all methods in this class can throw an std::bad_alloc exception if + * the underlying buffer class wants to resize. + * + * Read the tutorial to understand how this class is used. In most cases you + * want to use the pbf_builder class which uses a std::string as buffer type. + */ +template +class basic_pbf_builder : public basic_pbf_writer { + + static_assert(std::is_same::type>::value, + "T must be enum with underlying type protozero::pbf_tag_type"); + +public: + + /// The type of messages this class will build. + using enum_type = T; + + basic_pbf_builder() = default; + + /** + * Create a builder using the given string as a data store. The object + * stores a reference to that string and adds all data to it. The string + * doesn't have to be empty. The pbf_message object will just append data. + */ + explicit basic_pbf_builder(TBuffer& data) noexcept : + basic_pbf_writer{data} { + } + + /** + * Construct a pbf_builder for a submessage from the pbf_message or + * pbf_writer of the parent message. 
+ * + * @param parent_writer The parent pbf_message or pbf_writer + * @param tag Tag of the field that will be written + */ + template + basic_pbf_builder(basic_pbf_writer& parent_writer, P tag) noexcept : + basic_pbf_writer{parent_writer, pbf_tag_type(tag)} { + } + +/// @cond INTERNAL +#define PROTOZERO_WRITER_WRAP_ADD_SCALAR(name, type) \ + void add_##name(T tag, type value) { \ + basic_pbf_writer::add_##name(pbf_tag_type(tag), value); \ + } + + PROTOZERO_WRITER_WRAP_ADD_SCALAR(bool, bool) + PROTOZERO_WRITER_WRAP_ADD_SCALAR(enum, int32_t) + PROTOZERO_WRITER_WRAP_ADD_SCALAR(int32, int32_t) + PROTOZERO_WRITER_WRAP_ADD_SCALAR(sint32, int32_t) + PROTOZERO_WRITER_WRAP_ADD_SCALAR(uint32, uint32_t) + PROTOZERO_WRITER_WRAP_ADD_SCALAR(int64, int64_t) + PROTOZERO_WRITER_WRAP_ADD_SCALAR(sint64, int64_t) + PROTOZERO_WRITER_WRAP_ADD_SCALAR(uint64, uint64_t) + PROTOZERO_WRITER_WRAP_ADD_SCALAR(fixed32, uint32_t) + PROTOZERO_WRITER_WRAP_ADD_SCALAR(sfixed32, int32_t) + PROTOZERO_WRITER_WRAP_ADD_SCALAR(fixed64, uint64_t) + PROTOZERO_WRITER_WRAP_ADD_SCALAR(sfixed64, int64_t) + PROTOZERO_WRITER_WRAP_ADD_SCALAR(float, float) + PROTOZERO_WRITER_WRAP_ADD_SCALAR(double, double) + +#undef PROTOZERO_WRITER_WRAP_ADD_SCALAR +/// @endcond + + /** + * Add "bytes" field to data. + * + * @param tag Tag of the field + * @param value Pointer to value to be written + * @param size Number of bytes to be written + */ + void add_bytes(T tag, const char* value, std::size_t size) { + basic_pbf_writer::add_bytes(pbf_tag_type(tag), value, size); + } + + /** + * Add "bytes" field to data. + * + * @param tag Tag of the field + * @param value Value to be written + */ + void add_bytes(T tag, const data_view& value) { + basic_pbf_writer::add_bytes(pbf_tag_type(tag), value); + } + + /** + * Add "bytes" field to data. 
+ * + * @param tag Tag of the field + * @param value Value to be written + */ + void add_bytes(T tag, const std::string& value) { + basic_pbf_writer::add_bytes(pbf_tag_type(tag), value); + } + + /** + * Add "bytes" field to data. Bytes from the value are written until + * a null byte is encountered. The null byte is not added. + * + * @param tag Tag of the field + * @param value Pointer to zero-delimited value to be written + */ + void add_bytes(T tag, const char* value) { + basic_pbf_writer::add_bytes(pbf_tag_type(tag), value); + } + + /** + * Add "bytes" field to data using vectored input. All the data in the + * 2nd and further arguments is "concatenated" with only a single copy + * into the final buffer. + * + * This will work with objects of any type supporting the data() and + * size() methods like std::string or protozero::data_view. + * + * Example: + * @code + * std::string data1 = "abc"; + * std::string data2 = "xyz"; + * builder.add_bytes_vectored(1, data1, data2); + * @endcode + * + * @tparam Ts List of types supporting data() and size() methods. + * @param tag Tag of the field + * @param values List of objects of types Ts with data to be appended. + */ + template + void add_bytes_vectored(T tag, Ts&&... values) { + basic_pbf_writer::add_bytes_vectored(pbf_tag_type(tag), std::forward(values)...); + } + + /** + * Add "string" field to data. + * + * @param tag Tag of the field + * @param value Pointer to value to be written + * @param size Number of bytes to be written + */ + void add_string(T tag, const char* value, std::size_t size) { + basic_pbf_writer::add_string(pbf_tag_type(tag), value, size); + } + + /** + * Add "string" field to data. + * + * @param tag Tag of the field + * @param value Value to be written + */ + void add_string(T tag, const data_view& value) { + basic_pbf_writer::add_string(pbf_tag_type(tag), value); + } + + /** + * Add "string" field to data. 
+ * + * @param tag Tag of the field + * @param value Value to be written + */ + void add_string(T tag, const std::string& value) { + basic_pbf_writer::add_string(pbf_tag_type(tag), value); + } + + /** + * Add "string" field to data. Bytes from the value are written until + * a null byte is encountered. The null byte is not added. + * + * @param tag Tag of the field + * @param value Pointer to value to be written + */ + void add_string(T tag, const char* value) { + basic_pbf_writer::add_string(pbf_tag_type(tag), value); + } + + /** + * Add "message" field to data. + * + * @param tag Tag of the field + * @param value Pointer to message to be written + * @param size Length of the message + */ + void add_message(T tag, const char* value, std::size_t size) { + basic_pbf_writer::add_message(pbf_tag_type(tag), value, size); + } + + /** + * Add "message" field to data. + * + * @param tag Tag of the field + * @param value Value to be written. The value must be a complete message. + */ + void add_message(T tag, const data_view& value) { + basic_pbf_writer::add_message(pbf_tag_type(tag), value); + } + + /** + * Add "message" field to data. + * + * @param tag Tag of the field + * @param value Value to be written. The value must be a complete message. 
+ */ + void add_message(T tag, const std::string& value) { + basic_pbf_writer::add_message(pbf_tag_type(tag), value); + } + +/// @cond INTERNAL +#define PROTOZERO_WRITER_WRAP_ADD_PACKED(name) \ + template \ + void add_packed_##name(T tag, InputIterator first, InputIterator last) { \ + basic_pbf_writer::add_packed_##name(pbf_tag_type(tag), first, last); \ + } + + PROTOZERO_WRITER_WRAP_ADD_PACKED(bool) + PROTOZERO_WRITER_WRAP_ADD_PACKED(enum) + PROTOZERO_WRITER_WRAP_ADD_PACKED(int32) + PROTOZERO_WRITER_WRAP_ADD_PACKED(sint32) + PROTOZERO_WRITER_WRAP_ADD_PACKED(uint32) + PROTOZERO_WRITER_WRAP_ADD_PACKED(int64) + PROTOZERO_WRITER_WRAP_ADD_PACKED(sint64) + PROTOZERO_WRITER_WRAP_ADD_PACKED(uint64) + PROTOZERO_WRITER_WRAP_ADD_PACKED(fixed32) + PROTOZERO_WRITER_WRAP_ADD_PACKED(sfixed32) + PROTOZERO_WRITER_WRAP_ADD_PACKED(fixed64) + PROTOZERO_WRITER_WRAP_ADD_PACKED(sfixed64) + PROTOZERO_WRITER_WRAP_ADD_PACKED(float) + PROTOZERO_WRITER_WRAP_ADD_PACKED(double) + +#undef PROTOZERO_WRITER_WRAP_ADD_PACKED +/// @endcond + +}; // class basic_pbf_builder + +} // end namespace protozero + +#endif // PROTOZERO_BASIC_PBF_BUILDER_HPP diff --git a/include/protozero/basic_pbf_writer.hpp b/include/protozero/basic_pbf_writer.hpp new file mode 100644 index 00000000..f167c4d1 --- /dev/null +++ b/include/protozero/basic_pbf_writer.hpp @@ -0,0 +1,1054 @@ +#ifndef PROTOZERO_BASIC_PBF_WRITER_HPP +#define PROTOZERO_BASIC_PBF_WRITER_HPP + +/***************************************************************************** + +protozero - Minimalistic protocol buffer decoder and encoder in C++. + +This file is from https://github.com/mapbox/protozero where you can find more +documentation. + +*****************************************************************************/ + +/** + * @file basic_pbf_writer.hpp + * + * @brief Contains the basic_pbf_writer template class. 
+ */ + +#include "buffer_tmpl.hpp" +#include "config.hpp" +#include "data_view.hpp" +#include "types.hpp" +#include "varint.hpp" + +#if PROTOZERO_BYTE_ORDER != PROTOZERO_LITTLE_ENDIAN +# include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace protozero { + +namespace detail { + + template class packed_field_varint; + template class packed_field_svarint; + template class packed_field_fixed; + +} // end namespace detail + +/** + * The basic_pbf_writer is used to write PBF formatted messages into a buffer. + * + * This uses TBuffer as the type for the underlaying buffer. In typical uses + * this is std::string, but you can use a different type that must support + * the right interface. Please see the documentation for details. + * + * Almost all methods in this class can throw an std::bad_alloc exception if + * the underlying buffer class wants to resize. + */ +template +class basic_pbf_writer { + + // A pointer to a buffer holding the data already written to the PBF + // message. For default constructed writers or writers that have been + // rolled back, this is a nullptr. + TBuffer* m_data = nullptr; + + // A pointer to a parent writer object if this is a submessage. If this + // is a top-level writer, it is a nullptr. + basic_pbf_writer* m_parent_writer = nullptr; + + // This is usually 0. If there is an open submessage, this is set in the + // parent to the rollback position, ie. the last position before the + // submessage was started. This is the position where the header of the + // submessage starts. + std::size_t m_rollback_pos = 0; + + // This is usually 0. If there is an open submessage, this is set in the + // parent to the position where the data of the submessage is written to. 
+ std::size_t m_pos = 0; + + void add_varint(uint64_t value) { + protozero_assert(m_pos == 0 && "you can't add fields to a parent basic_pbf_writer if there is an existing basic_pbf_writer for a submessage"); + protozero_assert(m_data); + add_varint_to_buffer(m_data, value); + } + + void add_field(pbf_tag_type tag, pbf_wire_type type) { + protozero_assert(((tag > 0 && tag < 19000) || (tag > 19999 && tag <= ((1U << 29U) - 1))) && "tag out of range"); + const uint32_t b = (tag << 3U) | uint32_t(type); + add_varint(b); + } + + void add_tagged_varint(pbf_tag_type tag, uint64_t value) { + add_field(tag, pbf_wire_type::varint); + add_varint(value); + } + + template + void add_fixed(T value) { + protozero_assert(m_pos == 0 && "you can't add fields to a parent basic_pbf_writer if there is an existing basic_pbf_writer for a submessage"); + protozero_assert(m_data); +#if PROTOZERO_BYTE_ORDER != PROTOZERO_LITTLE_ENDIAN + byteswap_inplace(&value); +#endif + buffer_customization::append(m_data, reinterpret_cast(&value), sizeof(T)); + } + + template + void add_packed_fixed(pbf_tag_type tag, It first, It last, std::input_iterator_tag /*unused*/) { + if (first == last) { + return; + } + + basic_pbf_writer sw{*this, tag}; + + while (first != last) { + sw.add_fixed(*first++); + } + } + + template + void add_packed_fixed(pbf_tag_type tag, It first, It last, std::forward_iterator_tag /*unused*/) { + if (first == last) { + return; + } + + const auto length = std::distance(first, last); + add_length_varint(tag, sizeof(T) * pbf_length_type(length)); + reserve(sizeof(T) * std::size_t(length)); + + while (first != last) { + add_fixed(*first++); + } + } + + template + void add_packed_varint(pbf_tag_type tag, It first, It last) { + if (first == last) { + return; + } + + basic_pbf_writer sw{*this, tag}; + + while (first != last) { + sw.add_varint(uint64_t(*first++)); + } + } + + template + void add_packed_svarint(pbf_tag_type tag, It first, It last) { + if (first == last) { + return; + } + + 
basic_pbf_writer sw{*this, tag}; + + while (first != last) { + sw.add_varint(encode_zigzag64(*first++)); + } + } + + // The number of bytes to reserve for the varint holding the length of + // a length-delimited field. The length has to fit into pbf_length_type, + // and a varint needs 8 bit for every 7 bit. + enum : int { + reserve_bytes = sizeof(pbf_length_type) * 8 / 7 + 1 + }; + + // If m_rollpack_pos is set to this special value, it means that when + // the submessage is closed, nothing needs to be done, because the length + // of the submessage has already been written correctly. + enum : std::size_t { + size_is_known = std::numeric_limits::max() + }; + + void open_submessage(pbf_tag_type tag, std::size_t size) { + protozero_assert(m_pos == 0); + protozero_assert(m_data); + if (size == 0) { + m_rollback_pos = buffer_customization::size(m_data); + add_field(tag, pbf_wire_type::length_delimited); + buffer_customization::append_zeros(m_data, std::size_t(reserve_bytes)); + } else { + m_rollback_pos = size_is_known; + add_length_varint(tag, pbf_length_type(size)); + reserve(size); + } + m_pos = buffer_customization::size(m_data); + } + + void rollback_submessage() { + protozero_assert(m_pos != 0); + protozero_assert(m_rollback_pos != size_is_known); + protozero_assert(m_data); + buffer_customization::resize(m_data, m_rollback_pos); + m_pos = 0; + } + + void commit_submessage() { + protozero_assert(m_pos != 0); + protozero_assert(m_rollback_pos != size_is_known); + protozero_assert(m_data); + const auto length = pbf_length_type(buffer_customization::size(m_data) - m_pos); + + protozero_assert(buffer_customization::size(m_data) >= m_pos - reserve_bytes); + const auto n = add_varint_to_buffer(buffer_customization::at_pos(m_data, m_pos - reserve_bytes), length); + + buffer_customization::erase_range(m_data, m_pos - reserve_bytes + n, m_pos); + m_pos = 0; + } + + void close_submessage() { + protozero_assert(m_data); + if (m_pos == 0 || m_rollback_pos == size_is_known) 
{ + return; + } + if (buffer_customization::size(m_data) - m_pos == 0) { + rollback_submessage(); + } else { + commit_submessage(); + } + } + + void add_length_varint(pbf_tag_type tag, pbf_length_type length) { + add_field(tag, pbf_wire_type::length_delimited); + add_varint(length); + } + +public: + + /** + * Create a writer using the specified buffer as a data store. The + * basic_pbf_writer stores a pointer to that buffer and adds all data to + * it. The buffer doesn't have to be empty. The basic_pbf_writer will just + * append data. + */ + explicit basic_pbf_writer(TBuffer& buffer) noexcept : + m_data{&buffer} { + } + + /** + * Create a writer without a data store. In this form the writer can not + * be used! + */ + basic_pbf_writer() noexcept = default; + + /** + * Construct a basic_pbf_writer for a submessage from the basic_pbf_writer + * of the parent message. + * + * @param parent_writer The basic_pbf_writer + * @param tag Tag (field number) of the field that will be written + * @param size Optional size of the submessage in bytes (use 0 for unknown). + * Setting this allows some optimizations but is only possible in + * a few very specific cases. + */ + basic_pbf_writer(basic_pbf_writer& parent_writer, pbf_tag_type tag, std::size_t size = 0) : + m_data{parent_writer.m_data}, + m_parent_writer{&parent_writer} { + m_parent_writer->open_submessage(tag, size); + } + + /// A basic_pbf_writer object can not be copied + basic_pbf_writer(const basic_pbf_writer&) = delete; + + /// A basic_pbf_writer object can not be copied + basic_pbf_writer& operator=(const basic_pbf_writer&) = delete; + + /** + * A basic_pbf_writer object can be moved. After this the other + * basic_pbf_writer will be invalid. 
+ */ + basic_pbf_writer(basic_pbf_writer&& other) noexcept : + m_data{other.m_data}, + m_parent_writer{other.m_parent_writer}, + m_rollback_pos{other.m_rollback_pos}, + m_pos{other.m_pos} { + other.m_data = nullptr; + other.m_parent_writer = nullptr; + other.m_rollback_pos = 0; + other.m_pos = 0; + } + + /** + * A basic_pbf_writer object can be moved. After this the other + * basic_pbf_writer will be invalid. + */ + basic_pbf_writer& operator=(basic_pbf_writer&& other) noexcept { + m_data = other.m_data; + m_parent_writer = other.m_parent_writer; + m_rollback_pos = other.m_rollback_pos; + m_pos = other.m_pos; + other.m_data = nullptr; + other.m_parent_writer = nullptr; + other.m_rollback_pos = 0; + other.m_pos = 0; + return *this; + } + + ~basic_pbf_writer() noexcept { + try { + if (m_parent_writer != nullptr) { + m_parent_writer->close_submessage(); + } + } catch (...) { + // This try/catch is used to make the destructor formally noexcept. + // close_submessage() is not noexcept, but will not throw the way + // it is called here, so we are good. But to be paranoid, call... + std::terminate(); + } + } + + /** + * Check if this writer is valid. A writer is invalid if it was default + * constructed, moved from, or if commit() has been called on it. + * Otherwise it is valid. + */ + bool valid() const noexcept { + return m_data != nullptr; + } + + /** + * Swap the contents of this object with the other. + * + * @param other Other object to swap data with. + */ + void swap(basic_pbf_writer& other) noexcept { + using std::swap; + swap(m_data, other.m_data); + swap(m_parent_writer, other.m_parent_writer); + swap(m_rollback_pos, other.m_rollback_pos); + swap(m_pos, other.m_pos); + } + + /** + * Reserve size bytes in the underlying message store in addition to + * whatever the message store already holds. So unlike + * the `std::string::reserve()` method this is not an absolute size, + * but additional memory that should be reserved. 
+ * + * @param size Number of bytes to reserve in underlying message store. + */ + void reserve(std::size_t size) { + protozero_assert(m_data); + buffer_customization::reserve_additional(m_data, size); + } + + /** + * Commit this submessage. This does the same as when the basic_pbf_writer + * goes out of scope and is destructed. + * + * @pre Must be a basic_pbf_writer of a submessage, ie one opened with the + * basic_pbf_writer constructor taking a parent message. + * @post The basic_pbf_writer is invalid and can't be used any more. + */ + void commit() { + protozero_assert(m_parent_writer && "you can't call commit() on a basic_pbf_writer without a parent"); + protozero_assert(m_pos == 0 && "you can't call commit() on a basic_pbf_writer that has an open nested submessage"); + m_parent_writer->close_submessage(); + m_parent_writer = nullptr; + m_data = nullptr; + } + + /** + * Cancel writing of this submessage. The complete submessage will be + * removed as if it was never created and no fields were added. + * + * @pre Must be a basic_pbf_writer of a submessage, ie one opened with the + * basic_pbf_writer constructor taking a parent message. + * @post The basic_pbf_writer is invalid and can't be used any more. + */ + void rollback() { + protozero_assert(m_parent_writer && "you can't call rollback() on a basic_pbf_writer without a parent"); + protozero_assert(m_pos == 0 && "you can't call rollback() on a basic_pbf_writer that has an open nested submessage"); + m_parent_writer->rollback_submessage(); + m_parent_writer = nullptr; + m_data = nullptr; + } + + ///@{ + /** + * @name Scalar field writer functions + */ + + /** + * Add "bool" field to data. 
+ * + * @param tag Tag (field number) of the field + * @param value Value to be written + */ + void add_bool(pbf_tag_type tag, bool value) { + add_field(tag, pbf_wire_type::varint); + protozero_assert(m_pos == 0 && "you can't add fields to a parent basic_pbf_writer if there is an existing basic_pbf_writer for a submessage"); + protozero_assert(m_data); + m_data->push_back(char(value)); + } + + /** + * Add "enum" field to data. + * + * @param tag Tag (field number) of the field + * @param value Value to be written + */ + void add_enum(pbf_tag_type tag, int32_t value) { + add_tagged_varint(tag, uint64_t(value)); + } + + /** + * Add "int32" field to data. + * + * @param tag Tag (field number) of the field + * @param value Value to be written + */ + void add_int32(pbf_tag_type tag, int32_t value) { + add_tagged_varint(tag, uint64_t(value)); + } + + /** + * Add "sint32" field to data. + * + * @param tag Tag (field number) of the field + * @param value Value to be written + */ + void add_sint32(pbf_tag_type tag, int32_t value) { + add_tagged_varint(tag, encode_zigzag32(value)); + } + + /** + * Add "uint32" field to data. + * + * @param tag Tag (field number) of the field + * @param value Value to be written + */ + void add_uint32(pbf_tag_type tag, uint32_t value) { + add_tagged_varint(tag, value); + } + + /** + * Add "int64" field to data. + * + * @param tag Tag (field number) of the field + * @param value Value to be written + */ + void add_int64(pbf_tag_type tag, int64_t value) { + add_tagged_varint(tag, uint64_t(value)); + } + + /** + * Add "sint64" field to data. + * + * @param tag Tag (field number) of the field + * @param value Value to be written + */ + void add_sint64(pbf_tag_type tag, int64_t value) { + add_tagged_varint(tag, encode_zigzag64(value)); + } + + /** + * Add "uint64" field to data. 
+ * + * @param tag Tag (field number) of the field + * @param value Value to be written + */ + void add_uint64(pbf_tag_type tag, uint64_t value) { + add_tagged_varint(tag, value); + } + + /** + * Add "fixed32" field to data. + * + * @param tag Tag (field number) of the field + * @param value Value to be written + */ + void add_fixed32(pbf_tag_type tag, uint32_t value) { + add_field(tag, pbf_wire_type::fixed32); + add_fixed(value); + } + + /** + * Add "sfixed32" field to data. + * + * @param tag Tag (field number) of the field + * @param value Value to be written + */ + void add_sfixed32(pbf_tag_type tag, int32_t value) { + add_field(tag, pbf_wire_type::fixed32); + add_fixed(value); + } + + /** + * Add "fixed64" field to data. + * + * @param tag Tag (field number) of the field + * @param value Value to be written + */ + void add_fixed64(pbf_tag_type tag, uint64_t value) { + add_field(tag, pbf_wire_type::fixed64); + add_fixed(value); + } + + /** + * Add "sfixed64" field to data. + * + * @param tag Tag (field number) of the field + * @param value Value to be written + */ + void add_sfixed64(pbf_tag_type tag, int64_t value) { + add_field(tag, pbf_wire_type::fixed64); + add_fixed(value); + } + + /** + * Add "float" field to data. + * + * @param tag Tag (field number) of the field + * @param value Value to be written + */ + void add_float(pbf_tag_type tag, float value) { + add_field(tag, pbf_wire_type::fixed32); + add_fixed(value); + } + + /** + * Add "double" field to data. + * + * @param tag Tag (field number) of the field + * @param value Value to be written + */ + void add_double(pbf_tag_type tag, double value) { + add_field(tag, pbf_wire_type::fixed64); + add_fixed(value); + } + + /** + * Add "bytes" field to data. 
+ * + * @param tag Tag (field number) of the field + * @param value Pointer to value to be written + * @param size Number of bytes to be written + */ + void add_bytes(pbf_tag_type tag, const char* value, std::size_t size) { + protozero_assert(m_pos == 0 && "you can't add fields to a parent basic_pbf_writer if there is an existing basic_pbf_writer for a submessage"); + protozero_assert(m_data); + protozero_assert(size <= std::numeric_limits::max()); + add_length_varint(tag, pbf_length_type(size)); + buffer_customization::append(m_data, value, size); + } + + /** + * Add "bytes" field to data. + * + * @param tag Tag (field number) of the field + * @param value Value to be written + */ + void add_bytes(pbf_tag_type tag, const data_view& value) { + add_bytes(tag, value.data(), value.size()); + } + + /** + * Add "bytes" field to data. + * + * @param tag Tag (field number) of the field + * @param value Value to be written + */ + void add_bytes(pbf_tag_type tag, const std::string& value) { + add_bytes(tag, value.data(), value.size()); + } + + /** + * Add "bytes" field to data. Bytes from the value are written until + * a null byte is encountered. The null byte is not added. + * + * @param tag Tag (field number) of the field + * @param value Pointer to zero-delimited value to be written + */ + void add_bytes(pbf_tag_type tag, const char* value) { + add_bytes(tag, value, std::strlen(value)); + } + + /** + * Add "bytes" field to data using vectored input. All the data in the + * 2nd and further arguments is "concatenated" with only a single copy + * into the final buffer. + * + * This will work with objects of any type supporting the data() and + * size() methods like std::string or protozero::data_view. + * + * Example: + * @code + * std::string data1 = "abc"; + * std::string data2 = "xyz"; + * writer.add_bytes_vectored(1, data1, data2); + * @endcode + * + * @tparam Ts List of types supporting data() and size() methods. 
+ * @param tag Tag (field number) of the field + * @param values List of objects of types Ts with data to be appended. + */ + template + void add_bytes_vectored(pbf_tag_type tag, Ts&&... values) { + protozero_assert(m_pos == 0 && "you can't add fields to a parent basic_pbf_writer if there is an existing basic_pbf_writer for a submessage"); + protozero_assert(m_data); + size_t sum_size = 0; + (void)std::initializer_list{sum_size += values.size()...}; + protozero_assert(sum_size <= std::numeric_limits::max()); + add_length_varint(tag, pbf_length_type(sum_size)); + buffer_customization::reserve_additional(m_data, sum_size); + (void)std::initializer_list{(buffer_customization::append(m_data, values.data(), values.size()), 0)...}; + } + + /** + * Add "string" field to data. + * + * @param tag Tag (field number) of the field + * @param value Pointer to value to be written + * @param size Number of bytes to be written + */ + void add_string(pbf_tag_type tag, const char* value, std::size_t size) { + add_bytes(tag, value, size); + } + + /** + * Add "string" field to data. + * + * @param tag Tag (field number) of the field + * @param value Value to be written + */ + void add_string(pbf_tag_type tag, const data_view& value) { + add_bytes(tag, value.data(), value.size()); + } + + /** + * Add "string" field to data. + * + * @param tag Tag (field number) of the field + * @param value Value to be written + */ + void add_string(pbf_tag_type tag, const std::string& value) { + add_bytes(tag, value.data(), value.size()); + } + + /** + * Add "string" field to data. Bytes from the value are written until + * a null byte is encountered. The null byte is not added. + * + * @param tag Tag (field number) of the field + * @param value Pointer to value to be written + */ + void add_string(pbf_tag_type tag, const char* value) { + add_bytes(tag, value, std::strlen(value)); + } + + /** + * Add "message" field to data. 
+ * + * @param tag Tag (field number) of the field + * @param value Pointer to message to be written + * @param size Length of the message + */ + void add_message(pbf_tag_type tag, const char* value, std::size_t size) { + add_bytes(tag, value, size); + } + + /** + * Add "message" field to data. + * + * @param tag Tag (field number) of the field + * @param value Value to be written. The value must be a complete message. + */ + void add_message(pbf_tag_type tag, const data_view& value) { + add_bytes(tag, value.data(), value.size()); + } + + /** + * Add "message" field to data. + * + * @param tag Tag (field number) of the field + * @param value Value to be written. The value must be a complete message. + */ + void add_message(pbf_tag_type tag, const std::string& value) { + add_bytes(tag, value.data(), value.size()); + } + + ///@} + + ///@{ + /** + * @name Repeated packed field writer functions + */ + + /** + * Add "repeated packed bool" field to data. + * + * @tparam InputIterator A type satisfying the InputIterator concept. + * Dereferencing the iterator must yield a type assignable to bool. + * @param tag Tag (field number) of the field + * @param first Iterator pointing to the beginning of the data + * @param last Iterator pointing one past the end of data + */ + template + void add_packed_bool(pbf_tag_type tag, InputIterator first, InputIterator last) { + add_packed_varint(tag, first, last); + } + + /** + * Add "repeated packed enum" field to data. + * + * @tparam InputIterator A type satisfying the InputIterator concept. + * Dereferencing the iterator must yield a type assignable to int32_t. + * @param tag Tag (field number) of the field + * @param first Iterator pointing to the beginning of the data + * @param last Iterator pointing one past the end of data + */ + template + void add_packed_enum(pbf_tag_type tag, InputIterator first, InputIterator last) { + add_packed_varint(tag, first, last); + } + + /** + * Add "repeated packed int32" field to data. 
+ * + * @tparam InputIterator A type satisfying the InputIterator concept. + * Dereferencing the iterator must yield a type assignable to int32_t. + * @param tag Tag (field number) of the field + * @param first Iterator pointing to the beginning of the data + * @param last Iterator pointing one past the end of data + */ + template + void add_packed_int32(pbf_tag_type tag, InputIterator first, InputIterator last) { + add_packed_varint(tag, first, last); + } + + /** + * Add "repeated packed sint32" field to data. + * + * @tparam InputIterator A type satisfying the InputIterator concept. + * Dereferencing the iterator must yield a type assignable to int32_t. + * @param tag Tag (field number) of the field + * @param first Iterator pointing to the beginning of the data + * @param last Iterator pointing one past the end of data + */ + template + void add_packed_sint32(pbf_tag_type tag, InputIterator first, InputIterator last) { + add_packed_svarint(tag, first, last); + } + + /** + * Add "repeated packed uint32" field to data. + * + * @tparam InputIterator A type satisfying the InputIterator concept. + * Dereferencing the iterator must yield a type assignable to uint32_t. + * @param tag Tag (field number) of the field + * @param first Iterator pointing to the beginning of the data + * @param last Iterator pointing one past the end of data + */ + template + void add_packed_uint32(pbf_tag_type tag, InputIterator first, InputIterator last) { + add_packed_varint(tag, first, last); + } + + /** + * Add "repeated packed int64" field to data. + * + * @tparam InputIterator A type satisfying the InputIterator concept. + * Dereferencing the iterator must yield a type assignable to int64_t. 
+ * @param tag Tag (field number) of the field + * @param first Iterator pointing to the beginning of the data + * @param last Iterator pointing one past the end of data + */ + template + void add_packed_int64(pbf_tag_type tag, InputIterator first, InputIterator last) { + add_packed_varint(tag, first, last); + } + + /** + * Add "repeated packed sint64" field to data. + * + * @tparam InputIterator A type satisfying the InputIterator concept. + * Dereferencing the iterator must yield a type assignable to int64_t. + * @param tag Tag (field number) of the field + * @param first Iterator pointing to the beginning of the data + * @param last Iterator pointing one past the end of data + */ + template + void add_packed_sint64(pbf_tag_type tag, InputIterator first, InputIterator last) { + add_packed_svarint(tag, first, last); + } + + /** + * Add "repeated packed uint64" field to data. + * + * @tparam InputIterator A type satisfying the InputIterator concept. + * Dereferencing the iterator must yield a type assignable to uint64_t. + * @param tag Tag (field number) of the field + * @param first Iterator pointing to the beginning of the data + * @param last Iterator pointing one past the end of data + */ + template + void add_packed_uint64(pbf_tag_type tag, InputIterator first, InputIterator last) { + add_packed_varint(tag, first, last); + } + + /** + * Add a "repeated packed" fixed-size field to data. The following + * fixed-size fields are available: + * + * uint32_t -> repeated packed fixed32 + * int32_t -> repeated packed sfixed32 + * uint64_t -> repeated packed fixed64 + * int64_t -> repeated packed sfixed64 + * double -> repeated packed double + * float -> repeated packed float + * + * @tparam ValueType One of the following types: (u)int32/64_t, double, float. + * @tparam InputIterator A type satisfying the InputIterator concept. 
+ * @param tag Tag (field number) of the field + * @param first Iterator pointing to the beginning of the data + * @param last Iterator pointing one past the end of data + */ + template + void add_packed_fixed(pbf_tag_type tag, InputIterator first, InputIterator last) { + static_assert(std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value, "Only some types are allowed"); + add_packed_fixed(tag, first, last, + typename std::iterator_traits::iterator_category{}); + } + + /** + * Add "repeated packed fixed32" field to data. + * + * @tparam InputIterator A type satisfying the InputIterator concept. + * Dereferencing the iterator must yield a type assignable to uint32_t. + * @param tag Tag (field number) of the field + * @param first Iterator pointing to the beginning of the data + * @param last Iterator pointing one past the end of data + */ + template + void add_packed_fixed32(pbf_tag_type tag, InputIterator first, InputIterator last) { + add_packed_fixed(tag, first, last, + typename std::iterator_traits::iterator_category{}); + } + + /** + * Add "repeated packed sfixed32" field to data. + * + * @tparam InputIterator A type satisfying the InputIterator concept. + * Dereferencing the iterator must yield a type assignable to int32_t. + * @param tag Tag (field number) of the field + * @param first Iterator pointing to the beginning of the data + * @param last Iterator pointing one past the end of data + */ + template + void add_packed_sfixed32(pbf_tag_type tag, InputIterator first, InputIterator last) { + add_packed_fixed(tag, first, last, + typename std::iterator_traits::iterator_category{}); + } + + /** + * Add "repeated packed fixed64" field to data. + * + * @tparam InputIterator A type satisfying the InputIterator concept. + * Dereferencing the iterator must yield a type assignable to uint64_t. 
+ * @param tag Tag (field number) of the field + * @param first Iterator pointing to the beginning of the data + * @param last Iterator pointing one past the end of data + */ + template + void add_packed_fixed64(pbf_tag_type tag, InputIterator first, InputIterator last) { + add_packed_fixed(tag, first, last, + typename std::iterator_traits::iterator_category{}); + } + + /** + * Add "repeated packed sfixed64" field to data. + * + * @tparam InputIterator A type satisfying the InputIterator concept. + * Dereferencing the iterator must yield a type assignable to int64_t. + * @param tag Tag (field number) of the field + * @param first Iterator pointing to the beginning of the data + * @param last Iterator pointing one past the end of data + */ + template + void add_packed_sfixed64(pbf_tag_type tag, InputIterator first, InputIterator last) { + add_packed_fixed(tag, first, last, + typename std::iterator_traits::iterator_category{}); + } + + /** + * Add "repeated packed float" field to data. + * + * @tparam InputIterator A type satisfying the InputIterator concept. + * Dereferencing the iterator must yield a type assignable to float. + * @param tag Tag (field number) of the field + * @param first Iterator pointing to the beginning of the data + * @param last Iterator pointing one past the end of data + */ + template + void add_packed_float(pbf_tag_type tag, InputIterator first, InputIterator last) { + add_packed_fixed(tag, first, last, + typename std::iterator_traits::iterator_category{}); + } + + /** + * Add "repeated packed double" field to data. + * + * @tparam InputIterator A type satisfying the InputIterator concept. + * Dereferencing the iterator must yield a type assignable to double. 
+ * @param tag Tag (field number) of the field + * @param first Iterator pointing to the beginning of the data + * @param last Iterator pointing one past the end of data + */ + template + void add_packed_double(pbf_tag_type tag, InputIterator first, InputIterator last) { + add_packed_fixed(tag, first, last, + typename std::iterator_traits::iterator_category{}); + } + + ///@} + + template friend class detail::packed_field_varint; + template friend class detail::packed_field_svarint; + template friend class detail::packed_field_fixed; + +}; // class basic_pbf_writer + +/** + * Swap two basic_pbf_writer objects. + * + * @param lhs First object. + * @param rhs Second object. + */ +template +inline void swap(basic_pbf_writer& lhs, basic_pbf_writer& rhs) noexcept { + lhs.swap(rhs); +} + +namespace detail { + + template + class packed_field { + + basic_pbf_writer m_writer{}; + + public: + + packed_field(const packed_field&) = delete; + packed_field& operator=(const packed_field&) = delete; + + packed_field(packed_field&&) noexcept = default; + packed_field& operator=(packed_field&&) noexcept = default; + + packed_field() = default; + + packed_field(basic_pbf_writer& parent_writer, pbf_tag_type tag) : + m_writer{parent_writer, tag} { + } + + packed_field(basic_pbf_writer& parent_writer, pbf_tag_type tag, std::size_t size) : + m_writer{parent_writer, tag, size} { + } + + ~packed_field() noexcept = default; + + bool valid() const noexcept { + return m_writer.valid(); + } + + void commit() { + m_writer.commit(); + } + + void rollback() { + m_writer.rollback(); + } + + basic_pbf_writer& writer() noexcept { + return m_writer; + } + + }; // class packed_field + + template + class packed_field_fixed : public packed_field { + + public: + + packed_field_fixed() : + packed_field{} { + } + + template + packed_field_fixed(basic_pbf_writer& parent_writer, P tag) : + packed_field{parent_writer, static_cast(tag)} { + } + + template + packed_field_fixed(basic_pbf_writer& parent_writer, P 
tag, std::size_t size) : + packed_field{parent_writer, static_cast(tag), size * sizeof(T)} { + } + + void add_element(T value) { + this->writer().template add_fixed(value); + } + + }; // class packed_field_fixed + + template + class packed_field_varint : public packed_field { + + public: + + packed_field_varint() : + packed_field{} { + } + + template + packed_field_varint(basic_pbf_writer& parent_writer, P tag) : + packed_field{parent_writer, static_cast(tag)} { + } + + void add_element(T value) { + this->writer().add_varint(uint64_t(value)); + } + + }; // class packed_field_varint + + template + class packed_field_svarint : public packed_field { + + public: + + packed_field_svarint() : + packed_field{} { + } + + template + packed_field_svarint(basic_pbf_writer& parent_writer, P tag) : + packed_field{parent_writer, static_cast(tag)} { + } + + void add_element(T value) { + this->writer().add_varint(encode_zigzag64(value)); + } + + }; // class packed_field_svarint + +} // end namespace detail + +} // end namespace protozero + +#endif // PROTOZERO_BASIC_PBF_WRITER_HPP diff --git a/include/protozero/buffer_fixed.hpp b/include/protozero/buffer_fixed.hpp new file mode 100644 index 00000000..b2e6d1d2 --- /dev/null +++ b/include/protozero/buffer_fixed.hpp @@ -0,0 +1,222 @@ +#ifndef PROTOZERO_BUFFER_FIXED_HPP +#define PROTOZERO_BUFFER_FIXED_HPP + +/***************************************************************************** + +protozero - Minimalistic protocol buffer decoder and encoder in C++. + +This file is from https://github.com/mapbox/protozero where you can find more +documentation. + +*****************************************************************************/ + +/** + * @file buffer_fixed.hpp + * + * @brief Contains the fixed_size_buffer_adaptor class. 
+ */ + +#include "buffer_tmpl.hpp" +#include "config.hpp" + +#include +#include +#include +#include + +namespace protozero { + +/** + * This class can be used instead of std::string if you want to create a + * vector tile in a fixed-size buffer. Any operation that needs more space + * than is available will fail with a std::length_error exception. + */ +class fixed_size_buffer_adaptor { + + char* m_data; + std::size_t m_capacity; + std::size_t m_size = 0; + +public: + + /// @cond usual container typedefs not documented + + using size_type = std::size_t; + + using value_type = char; + using reference = value_type&; + using const_reference = const value_type&; + using pointer = value_type*; + using const_pointer = const value_type*; + + using iterator = pointer; + using const_iterator = const_pointer; + + /// @endcond + + /** + * Constructor. + * + * @param data Pointer to some memory allocated for the buffer. + * @param capacity Number of bytes available. + */ + fixed_size_buffer_adaptor(char* data, std::size_t capacity) noexcept : + m_data(data), + m_capacity(capacity) { + } + + /** + * Constructor. + * + * @param container Some container class supporting the member functions + * data() and size(). + */ + template + explicit fixed_size_buffer_adaptor(T& container) : + m_data(container.data()), + m_capacity(container.size()) { + } + + /// Returns a pointer to the data in the buffer. + const char* data() const noexcept { + return m_data; + } + + /// Returns a pointer to the data in the buffer. + char* data() noexcept { + return m_data; + } + + /// The capacity this buffer was created with. + std::size_t capacity() const noexcept { + return m_capacity; + } + + /// The number of bytes used in the buffer. Always <= capacity(). + std::size_t size() const noexcept { + return m_size; + } + + /// Return iterator to beginning of data. + char* begin() noexcept { + return m_data; + } + + /// Return iterator to beginning of data. 
+ const char* begin() const noexcept { + return m_data; + } + + /// Return iterator to beginning of data. + const char* cbegin() const noexcept { + return m_data; + } + + /// Return iterator to end of data. + char* end() noexcept { + return m_data + m_size; + } + + /// Return iterator to end of data. + const char* end() const noexcept { + return m_data + m_size; + } + + /// Return iterator to end of data. + const char* cend() const noexcept { + return m_data + m_size; + } + +/// @cond INTERNAL + + // Do not rely on anything beyond this point + + void append(const char* data, std::size_t count) { + if (m_size + count > m_capacity) { + throw std::length_error{"fixed size data store exhausted"}; + } + std::copy_n(data, count, m_data + m_size); + m_size += count; + } + + void append_zeros(std::size_t count) { + if (m_size + count > m_capacity) { + throw std::length_error{"fixed size data store exhausted"}; + } + std::fill_n(m_data + m_size, count, '\0'); + m_size += count; + } + + void resize(std::size_t size) { + protozero_assert(size < m_size); + if (size > m_capacity) { + throw std::length_error{"fixed size data store exhausted"}; + } + m_size = size; + } + + void erase_range(std::size_t from, std::size_t to) { + protozero_assert(from <= m_size); + protozero_assert(to <= m_size); + protozero_assert(from < to); + std::copy(m_data + to, m_data + m_size, m_data + from); + m_size -= (to - from); + } + + char* at_pos(std::size_t pos) { + protozero_assert(pos <= m_size); + return m_data + pos; + } + + void push_back(char ch) { + if (m_size >= m_capacity) { + throw std::length_error{"fixed size data store exhausted"}; + } + m_data[m_size++] = ch; + } +/// @endcond + +}; // class fixed_size_buffer_adaptor + +/// @cond INTERNAL +template <> +struct buffer_customization { + + static std::size_t size(const fixed_size_buffer_adaptor* buffer) noexcept { + return buffer->size(); + } + + static void append(fixed_size_buffer_adaptor* buffer, const char* data, std::size_t count) { + 
buffer->append(data, count); + } + + static void append_zeros(fixed_size_buffer_adaptor* buffer, std::size_t count) { + buffer->append_zeros(count); + } + + static void resize(fixed_size_buffer_adaptor* buffer, std::size_t size) { + buffer->resize(size); + } + + static void reserve_additional(fixed_size_buffer_adaptor* /*buffer*/, std::size_t /*size*/) { + /* nothing to be done for fixed-size buffers */ + } + + static void erase_range(fixed_size_buffer_adaptor* buffer, std::size_t from, std::size_t to) { + buffer->erase_range(from, to); + } + + static char* at_pos(fixed_size_buffer_adaptor* buffer, std::size_t pos) { + return buffer->at_pos(pos); + } + + static void push_back(fixed_size_buffer_adaptor* buffer, char ch) { + buffer->push_back(ch); + } + +}; +/// @endcond + +} // namespace protozero + +#endif // PROTOZERO_BUFFER_FIXED_HPP diff --git a/include/protozero/buffer_string.hpp b/include/protozero/buffer_string.hpp new file mode 100644 index 00000000..02e8ad25 --- /dev/null +++ b/include/protozero/buffer_string.hpp @@ -0,0 +1,78 @@ +#ifndef PROTOZERO_BUFFER_STRING_HPP +#define PROTOZERO_BUFFER_STRING_HPP + +/***************************************************************************** + +protozero - Minimalistic protocol buffer decoder and encoder in C++. + +This file is from https://github.com/mapbox/protozero where you can find more +documentation. 
+ +*****************************************************************************/ + +/** + * @file buffer_string.hpp + * + * @brief Contains the customization points for buffer implementation based + * on std::string + */ + +#include "buffer_tmpl.hpp" +#include "config.hpp" + +#include +#include +#include + +namespace protozero { + +// Implementation of buffer customizations points for std::string + +/// @cond INTERNAL +template <> +struct buffer_customization { + + static std::size_t size(const std::string* buffer) noexcept { + return buffer->size(); + } + + static void append(std::string* buffer, const char* data, std::size_t count) { + buffer->append(data, count); + } + + static void append_zeros(std::string* buffer, std::size_t count) { + buffer->append(count, '\0'); + } + + static void resize(std::string* buffer, std::size_t size) { + protozero_assert(size < buffer->size()); + buffer->resize(size); + } + + static void reserve_additional(std::string* buffer, std::size_t size) { + buffer->reserve(buffer->size() + size); + } + + static void erase_range(std::string* buffer, std::size_t from, std::size_t to) { + protozero_assert(from <= buffer->size()); + protozero_assert(to <= buffer->size()); + protozero_assert(from <= to); + buffer->erase(std::next(buffer->begin(), static_cast(from)), + std::next(buffer->begin(), static_cast(to))); + } + + static char* at_pos(std::string* buffer, std::size_t pos) { + protozero_assert(pos <= buffer->size()); + return (&*buffer->begin()) + pos; + } + + static void push_back(std::string* buffer, char ch) { + buffer->push_back(ch); + } + +}; +/// @endcond + +} // namespace protozero + +#endif // PROTOZERO_BUFFER_STRING_HPP diff --git a/include/protozero/buffer_tmpl.hpp b/include/protozero/buffer_tmpl.hpp new file mode 100644 index 00000000..ac223996 --- /dev/null +++ b/include/protozero/buffer_tmpl.hpp @@ -0,0 +1,113 @@ +#ifndef PROTOZERO_BUFFER_TMPL_HPP +#define PROTOZERO_BUFFER_TMPL_HPP + 
+/***************************************************************************** + +protozero - Minimalistic protocol buffer decoder and encoder in C++. + +This file is from https://github.com/mapbox/protozero where you can find more +documentation. + +*****************************************************************************/ + +/** + * @file buffer_tmpl.hpp + * + * @brief Contains the customization points for buffer implementations. + */ + +#include +#include +#include + +namespace protozero { + +// Implementation of buffer customizations points for std::string + +/// @cond INTERNAL +template +struct buffer_customization { + + /** + * Get the number of bytes currently used in the buffer. + * + * @param buffer Pointer to the buffer. + * @returns number of bytes used in the buffer. + */ + static std::size_t size(const std::string* buffer); + + /** + * Append count bytes from data to the buffer. + * + * @param buffer Pointer to the buffer. + * @param data Pointer to the data. + * @param count Number of bytes to be added to the buffer. + */ + static void append(std::string* buffer, const char* data, std::size_t count); + + /** + * Append count zero bytes to the buffer. + * + * @param buffer Pointer to the buffer. + * @param count Number of bytes to be added to the buffer. + */ + static void append_zeros(std::string* buffer, std::size_t count); + + /** + * Shrink the buffer to the specified size. The new size will always be + * smaller than the current size. + * + * @param buffer Pointer to the buffer. + * @param size New size of the buffer. + * + * @pre size < current size of buffer + */ + static void resize(std::string* buffer, std::size_t size); + + /** + * Reserve an additional size bytes for use in the buffer. This is used for + * variable-sized buffers to tell the buffer implementation that soon more + * memory will be used. The implementation can ignore this. + * + * @param buffer Pointer to the buffer. + * @param size Number of bytes to reserve. 
+ */ + static void reserve_additional(std::string* buffer, std::size_t size); + + /** + * Delete data from the buffer. This must move back the data after the + * part being deleted and resize the buffer accordingly. + * + * @param buffer Pointer to the buffer. + * @param from Offset into the buffer where we want to erase from. + * @param to Offset into the buffer one past the last byte we want to erase. + * + * @pre from, to <= size of the buffer, from < to + */ + static void erase_range(std::string* buffer, std::size_t from, std::size_t to); + + /** + * Return a pointer to the memory at the specified position in the buffer. + * + * @param buffer Pointer to the buffer. + * @param pos The position in the buffer. + * @returns pointer to the memory in the buffer at the specified position. + * + * @pre pos <= size of the buffer + */ + static char* at_pos(std::string* buffer, std::size_t pos); + + /** + * Add a char to the buffer incrementing the number of chars in the buffer. + * + * @param buffer Pointer to the buffer. + * @param ch The character to add. + */ + static void push_back(std::string* buffer, char ch); + +}; +/// @endcond + +} // namespace protozero + +#endif // PROTOZERO_BUFFER_TMPL_HPP diff --git a/include/protozero/buffer_vector.hpp b/include/protozero/buffer_vector.hpp new file mode 100644 index 00000000..c163300c --- /dev/null +++ b/include/protozero/buffer_vector.hpp @@ -0,0 +1,78 @@ +#ifndef PROTOZERO_BUFFER_VECTOR_HPP +#define PROTOZERO_BUFFER_VECTOR_HPP + +/***************************************************************************** + +protozero - Minimalistic protocol buffer decoder and encoder in C++. + +This file is from https://github.com/mapbox/protozero where you can find more +documentation. 
+ +*****************************************************************************/ + +/** + * @file buffer_vector.hpp + * + * @brief Contains the customization points for buffer implementation based + * on std::vector + */ + +#include "buffer_tmpl.hpp" +#include "config.hpp" + +#include +#include +#include + +namespace protozero { + +// Implementation of buffer customizations points for std::vector + +/// @cond INTERNAL +template <> +struct buffer_customization> { + + static std::size_t size(const std::vector* buffer) noexcept { + return buffer->size(); + } + + static void append(std::vector* buffer, const char* data, std::size_t count) { + buffer->insert(buffer->end(), data, data + count); + } + + static void append_zeros(std::vector* buffer, std::size_t count) { + buffer->insert(buffer->end(), count, '\0'); + } + + static void resize(std::vector* buffer, std::size_t size) { + protozero_assert(size < buffer->size()); + buffer->resize(size); + } + + static void reserve_additional(std::vector* buffer, std::size_t size) { + buffer->reserve(buffer->size() + size); + } + + static void erase_range(std::vector* buffer, std::size_t from, std::size_t to) { + protozero_assert(from <= buffer->size()); + protozero_assert(to <= buffer->size()); + protozero_assert(from <= to); + buffer->erase(std::next(buffer->begin(), static_cast(from)), + std::next(buffer->begin(), static_cast(to))); + } + + static char* at_pos(std::vector* buffer, std::size_t pos) { + protozero_assert(pos <= buffer->size()); + return (&*buffer->begin()) + pos; + } + + static void push_back(std::vector* buffer, char ch) { + buffer->push_back(ch); + } + +}; +/// @endcond + +} // namespace protozero + +#endif // PROTOZERO_BUFFER_VECTOR_HPP diff --git a/include/protozero/byteswap.hpp b/include/protozero/byteswap.hpp new file mode 100644 index 00000000..75cae691 --- /dev/null +++ b/include/protozero/byteswap.hpp @@ -0,0 +1,108 @@ +#ifndef PROTOZERO_BYTESWAP_HPP +#define PROTOZERO_BYTESWAP_HPP + 
+/***************************************************************************** + +protozero - Minimalistic protocol buffer decoder and encoder in C++. + +This file is from https://github.com/mapbox/protozero where you can find more +documentation. + +*****************************************************************************/ + +/** + * @file byteswap.hpp + * + * @brief Contains functions to swap bytes in values (for different endianness). + */ + +#include "config.hpp" + +#include +#include + +namespace protozero { +namespace detail { + +inline uint32_t byteswap_impl(uint32_t value) noexcept { +#ifdef PROTOZERO_USE_BUILTIN_BSWAP + return __builtin_bswap32(value); +#else + return ((value & 0xff000000U) >> 24U) | + ((value & 0x00ff0000U) >> 8U) | + ((value & 0x0000ff00U) << 8U) | + ((value & 0x000000ffU) << 24U); +#endif +} + +inline uint64_t byteswap_impl(uint64_t value) noexcept { +#ifdef PROTOZERO_USE_BUILTIN_BSWAP + return __builtin_bswap64(value); +#else + return ((value & 0xff00000000000000ULL) >> 56U) | + ((value & 0x00ff000000000000ULL) >> 40U) | + ((value & 0x0000ff0000000000ULL) >> 24U) | + ((value & 0x000000ff00000000ULL) >> 8U) | + ((value & 0x00000000ff000000ULL) << 8U) | + ((value & 0x0000000000ff0000ULL) << 24U) | + ((value & 0x000000000000ff00ULL) << 40U) | + ((value & 0x00000000000000ffULL) << 56U); +#endif +} + +} // end namespace detail + +/// byteswap the data pointed to by ptr in-place. +inline void byteswap_inplace(uint32_t* ptr) noexcept { + *ptr = detail::byteswap_impl(*ptr); +} + +/// byteswap the data pointed to by ptr in-place. +inline void byteswap_inplace(uint64_t* ptr) noexcept { + *ptr = detail::byteswap_impl(*ptr); +} + +/// byteswap the data pointed to by ptr in-place. +inline void byteswap_inplace(int32_t* ptr) noexcept { + auto* bptr = reinterpret_cast(ptr); + *bptr = detail::byteswap_impl(*bptr); +} + +/// byteswap the data pointed to by ptr in-place. 
+inline void byteswap_inplace(int64_t* ptr) noexcept { + auto* bptr = reinterpret_cast(ptr); + *bptr = detail::byteswap_impl(*bptr); +} + +/// byteswap the data pointed to by ptr in-place. +inline void byteswap_inplace(float* ptr) noexcept { + static_assert(sizeof(float) == 4, "Expecting four byte float"); + + uint32_t tmp = 0; + std::memcpy(&tmp, ptr, 4); + tmp = detail::byteswap_impl(tmp); // uint32 overload + std::memcpy(ptr, &tmp, 4); +} + +/// byteswap the data pointed to by ptr in-place. +inline void byteswap_inplace(double* ptr) noexcept { + static_assert(sizeof(double) == 8, "Expecting eight byte double"); + + uint64_t tmp = 0; + std::memcpy(&tmp, ptr, 8); + tmp = detail::byteswap_impl(tmp); // uint64 overload + std::memcpy(ptr, &tmp, 8); +} + +namespace detail { + + // Added for backwards compatibility with any code that might use this + // function (even if it shouldn't have). Will be removed in a later + // version of protozero. + using ::protozero::byteswap_inplace; + +} // end namespace detail + +} // end namespace protozero + +#endif // PROTOZERO_BYTESWAP_HPP diff --git a/include/protozero/config.hpp b/include/protozero/config.hpp new file mode 100644 index 00000000..6fc77490 --- /dev/null +++ b/include/protozero/config.hpp @@ -0,0 +1,48 @@ +#ifndef PROTOZERO_CONFIG_HPP +#define PROTOZERO_CONFIG_HPP + +/***************************************************************************** + +protozero - Minimalistic protocol buffer decoder and encoder in C++. + +This file is from https://github.com/mapbox/protozero where you can find more +documentation. + +*****************************************************************************/ + +#include + +/** + * @file config.hpp + * + * @brief Contains macro checks for different configurations. + */ + +#define PROTOZERO_LITTLE_ENDIAN 1234 +#define PROTOZERO_BIG_ENDIAN 4321 + +// Find out which byte order the machine has. 
+#if defined(__BYTE_ORDER) +# if (__BYTE_ORDER == __LITTLE_ENDIAN) +# define PROTOZERO_BYTE_ORDER PROTOZERO_LITTLE_ENDIAN +# endif +# if (__BYTE_ORDER == __BIG_ENDIAN) +# define PROTOZERO_BYTE_ORDER PROTOZERO_BIG_ENDIAN +# endif +#else +// This probably isn't a very good default, but might do until we figure +// out something better. +# define PROTOZERO_BYTE_ORDER PROTOZERO_LITTLE_ENDIAN +#endif + +// Check whether __builtin_bswap is available +#if defined(__GNUC__) || defined(__clang__) +# define PROTOZERO_USE_BUILTIN_BSWAP +#endif + +// Wrapper for assert() used for testing +#ifndef protozero_assert +# define protozero_assert(x) assert(x) +#endif + +#endif // PROTOZERO_CONFIG_HPP diff --git a/include/protozero/data_view.hpp b/include/protozero/data_view.hpp new file mode 100644 index 00000000..3ec87af3 --- /dev/null +++ b/include/protozero/data_view.hpp @@ -0,0 +1,236 @@ +#ifndef PROTOZERO_DATA_VIEW_HPP +#define PROTOZERO_DATA_VIEW_HPP + +/***************************************************************************** + +protozero - Minimalistic protocol buffer decoder and encoder in C++. + +This file is from https://github.com/mapbox/protozero where you can find more +documentation. + +*****************************************************************************/ + +/** + * @file data_view.hpp + * + * @brief Contains the implementation of the data_view class. + */ + +#include "config.hpp" + +#include +#include +#include +#include +#include + +namespace protozero { + +#ifdef PROTOZERO_USE_VIEW +using data_view = PROTOZERO_USE_VIEW; +#else + +/** + * Holds a pointer to some data and a length. + * + * This class is supposed to be compatible with the std::string_view + * that will be available in C++17. + */ +class data_view { + + const char* m_data = nullptr; + std::size_t m_size = 0; + +public: + + /** + * Default constructor. Construct an empty data_view. + */ + constexpr data_view() noexcept = default; + + /** + * Create data_view from pointer and size. 
+ * + * @param ptr Pointer to the data. + * @param length Length of the data. + */ + constexpr data_view(const char* ptr, std::size_t length) noexcept + : m_data{ptr}, + m_size{length} { + } + + /** + * Create data_view from string. + * + * @param str String with the data. + */ + data_view(const std::string& str) noexcept // NOLINT(google-explicit-constructor, hicpp-explicit-conversions) + : m_data{str.data()}, + m_size{str.size()} { + } + + /** + * Create data_view from zero-terminated string. + * + * @param ptr Pointer to the data. + */ + data_view(const char* ptr) noexcept // NOLINT(google-explicit-constructor, hicpp-explicit-conversions) + : m_data{ptr}, + m_size{std::strlen(ptr)} { + } + + /** + * Swap the contents of this object with the other. + * + * @param other Other object to swap data with. + */ + void swap(data_view& other) noexcept { + using std::swap; + swap(m_data, other.m_data); + swap(m_size, other.m_size); + } + + /// Return pointer to data. + constexpr const char* data() const noexcept { + return m_data; + } + + /// Return length of data in bytes. + constexpr std::size_t size() const noexcept { + return m_size; + } + + /// Returns true if size is 0. + constexpr bool empty() const noexcept { + return m_size == 0; + } + +#ifndef PROTOZERO_STRICT_API + /** + * Convert data view to string. + * + * @pre Must not be default constructed data_view. + * + * @deprecated to_string() is not available in C++17 string_view so it + * should not be used to make conversion to that class easier + * in the future. + */ + std::string to_string() const { + protozero_assert(m_data); + return {m_data, m_size}; + } +#endif + + /** + * Convert data view to string. + * + * @pre Must not be default constructed data_view. + */ + explicit operator std::string() const { + protozero_assert(m_data); + return {m_data, m_size}; + } + + /** + * Compares the contents of this object with the given other object. 
+ * + * @returns 0 if they are the same, <0 if this object is smaller than + * the other or >0 if it is larger. If both objects have the + * same size returns <0 if this object is lexicographically + * before the other, >0 otherwise. + * + * @pre Must not be default constructed data_view. + */ + int compare(data_view other) const noexcept { + assert(m_data && other.m_data); + const int cmp = std::memcmp(data(), other.data(), + std::min(size(), other.size())); + if (cmp == 0) { + if (size() == other.size()) { + return 0; + } + return size() < other.size() ? -1 : 1; + } + return cmp; + } + +}; // class data_view + +/** + * Swap two data_view objects. + * + * @param lhs First object. + * @param rhs Second object. + */ +inline void swap(data_view& lhs, data_view& rhs) noexcept { + lhs.swap(rhs); +} + +/** + * Two data_view instances are equal if they have the same size and the + * same content. + * + * @param lhs First object. + * @param rhs Second object. + */ +inline constexpr bool operator==(const data_view lhs, const data_view rhs) noexcept { + return lhs.size() == rhs.size() && + std::equal(lhs.data(), lhs.data() + lhs.size(), rhs.data()); +} + +/** + * Two data_view instances are not equal if they have different sizes or the + * content differs. + * + * @param lhs First object. + * @param rhs Second object. + */ +inline constexpr bool operator!=(const data_view lhs, const data_view rhs) noexcept { + return !(lhs == rhs); +} + +/** + * Returns true if lhs.compare(rhs) < 0. + * + * @param lhs First object. + * @param rhs Second object. + */ +inline bool operator<(const data_view lhs, const data_view rhs) noexcept { + return lhs.compare(rhs) < 0; +} + +/** + * Returns true if lhs.compare(rhs) <= 0. + * + * @param lhs First object. + * @param rhs Second object. + */ +inline bool operator<=(const data_view lhs, const data_view rhs) noexcept { + return lhs.compare(rhs) <= 0; +} + +/** + * Returns true if lhs.compare(rhs) > 0. + * + * @param lhs First object. 
+ * @param rhs Second object. + */ +inline bool operator>(const data_view lhs, const data_view rhs) noexcept { + return lhs.compare(rhs) > 0; +} + +/** + * Returns true if lhs.compare(rhs) >= 0. + * + * @param lhs First object. + * @param rhs Second object. + */ +inline bool operator>=(const data_view lhs, const data_view rhs) noexcept { + return lhs.compare(rhs) >= 0; +} + +#endif + +} // end namespace protozero + +#endif // PROTOZERO_DATA_VIEW_HPP diff --git a/include/protozero/exception.hpp b/include/protozero/exception.hpp new file mode 100644 index 00000000..a3cd0f15 --- /dev/null +++ b/include/protozero/exception.hpp @@ -0,0 +1,101 @@ +#ifndef PROTOZERO_EXCEPTION_HPP +#define PROTOZERO_EXCEPTION_HPP + +/***************************************************************************** + +protozero - Minimalistic protocol buffer decoder and encoder in C++. + +This file is from https://github.com/mapbox/protozero where you can find more +documentation. + +*****************************************************************************/ + +/** + * @file exception.hpp + * + * @brief Contains the exceptions used in the protozero library. + */ + +#include + +/** + * @brief All parts of the protozero header-only library are in this namespace. + */ +namespace protozero { + +/** + * All exceptions explicitly thrown by the functions of the protozero library + * derive from this exception. + */ +struct exception : std::exception { + /// Returns the explanatory string. + const char* what() const noexcept override { + return "pbf exception"; + } +}; + +/** + * This exception is thrown when parsing a varint that's larger than allowed. + * This should never happen unless the data is corrupted. + */ +struct varint_too_long_exception : exception { + /// Returns the explanatory string. + const char* what() const noexcept override { + return "varint too long exception"; + } +}; + +/** + * This exception is thrown when the wire type of a pbf field is unknown.
+ * This should never happen unless the data is corrupted. + */ +struct unknown_pbf_wire_type_exception : exception { + /// Returns the explanatory string. + const char* what() const noexcept override { + return "unknown pbf field type exception"; + } +}; + +/** + * This exception is thrown when we are trying to read a field and there + * are not enough bytes left in the buffer to read it. Almost all functions + * of the pbf_reader class can throw this exception. + * + * This should never happen unless the data is corrupted or you have + * initialized the pbf_reader object with incomplete data. + */ +struct end_of_buffer_exception : exception { + /// Returns the explanatory string. + const char* what() const noexcept override { + return "end of buffer exception"; + } +}; + +/** + * This exception is thrown when a tag has an invalid value. Tags must be + * unsigned integers between 1 and 2^29-1. Tags between 19000 and 19999 are + * not allowed. See + * https://developers.google.com/protocol-buffers/docs/proto#assigning-tags + */ +struct invalid_tag_exception : exception { + /// Returns the explanatory string. + const char* what() const noexcept override { + return "invalid tag exception"; + } +}; + +/** + * This exception is thrown when a length field of a packed repeated field is + * invalid. For fixed size types the length must be a multiple of the size of + * the type. + */ +struct invalid_length_exception : exception { + /// Returns the explanatory string. 
+ const char* what() const noexcept override { + return "invalid length exception"; + } +}; + +} // end namespace protozero + +#endif // PROTOZERO_EXCEPTION_HPP diff --git a/include/protozero/iterators.hpp b/include/protozero/iterators.hpp new file mode 100644 index 00000000..ee8ef8ec --- /dev/null +++ b/include/protozero/iterators.hpp @@ -0,0 +1,481 @@ +#ifndef PROTOZERO_ITERATORS_HPP +#define PROTOZERO_ITERATORS_HPP + +/***************************************************************************** + +protozero - Minimalistic protocol buffer decoder and encoder in C++. + +This file is from https://github.com/mapbox/protozero where you can find more +documentation. + +*****************************************************************************/ + +/** + * @file iterators.hpp + * + * @brief Contains the iterators for access to packed repeated fields. + */ + +#include "config.hpp" +#include "varint.hpp" + +#if PROTOZERO_BYTE_ORDER != PROTOZERO_LITTLE_ENDIAN +# include +#endif + +#include +#include +#include +#include + +namespace protozero { + +/** + * A range of iterators based on std::pair. Created from beginning and + * end iterators. Used as a return type from some pbf_reader methods + * that is easy to use with range-based for loops. + */ +template > +class iterator_range : +#ifdef PROTOZERO_STRICT_API + protected +#else + public +#endif + P { + +public: + + /// The type of the iterators in this range. + using iterator = T; + + /// The value type of the underlying iterator. + using value_type = typename std::iterator_traits::value_type; + + /** + * Default constructor. Create empty iterator_range. + */ + constexpr iterator_range() : + P{iterator{}, iterator{}} { + } + + /** + * Create iterator range from two iterators. + * + * @param first_iterator Iterator to beginning of range. + * @param last_iterator Iterator to end of range. 
+ */ + constexpr iterator_range(iterator&& first_iterator, iterator&& last_iterator) : + P{std::forward(first_iterator), + std::forward(last_iterator)} { + } + + /// Return iterator to beginning of range. + constexpr iterator begin() const noexcept { + return this->first; + } + + /// Return iterator to end of range. + constexpr iterator end() const noexcept { + return this->second; + } + + /// Return iterator to beginning of range. + constexpr iterator cbegin() const noexcept { + return this->first; + } + + /// Return iterator to end of range. + constexpr iterator cend() const noexcept { + return this->second; + } + + /** + * Return true if this range is empty. + * + * Complexity: Constant. + */ + constexpr bool empty() const noexcept { + return begin() == end(); + } + + /** + * Get the size of the range, ie the number of elements it contains. + * + * Complexity: Constant or linear depending on the underlying iterator. + */ + std::size_t size() const noexcept { + return static_cast(std::distance(begin(), end())); + } + + /** + * Get element at the beginning of the range. + * + * @pre Range must not be empty. + */ + value_type front() const { + protozero_assert(!empty()); + return *(this->first); + } + + /** + * Advance beginning of range by one. + * + * @pre Range must not be empty. + */ + void drop_front() { + protozero_assert(!empty()); + ++this->first; + } + + /** + * Swap the contents of this range with the other. + * + * @param other Other range to swap data with. + */ + void swap(iterator_range& other) noexcept { + using std::swap; + swap(this->first, other.first); + swap(this->second, other.second); + } + +}; // struct iterator_range + +/** + * Swap two iterator_ranges. + * + * @param lhs First range. + * @param rhs Second range.
+ */ +template +inline void swap(iterator_range& lhs, iterator_range& rhs) noexcept { + lhs.swap(rhs); +} + +/** + * A forward iterator used for accessing packed repeated fields of fixed + * length (fixed32, sfixed32, float, double). + */ +template +class const_fixed_iterator { + + /// Pointer to current iterator position + const char* m_data = nullptr; + +public: + + /// @cond usual iterator functions not documented + + using iterator_category = std::random_access_iterator_tag; + using value_type = T; + using difference_type = std::ptrdiff_t; + using pointer = value_type*; + using reference = value_type&; + + const_fixed_iterator() noexcept = default; + + explicit const_fixed_iterator(const char* data) noexcept : + m_data{data} { + } + + const_fixed_iterator(const const_fixed_iterator&) noexcept = default; + const_fixed_iterator(const_fixed_iterator&&) noexcept = default; + + const_fixed_iterator& operator=(const const_fixed_iterator&) noexcept = default; + const_fixed_iterator& operator=(const_fixed_iterator&&) noexcept = default; + + ~const_fixed_iterator() noexcept = default; + + value_type operator*() const noexcept { + value_type result; + std::memcpy(&result, m_data, sizeof(value_type)); +#if PROTOZERO_BYTE_ORDER != PROTOZERO_LITTLE_ENDIAN + byteswap_inplace(&result); +#endif + return result; + } + + const_fixed_iterator& operator++() noexcept { + m_data += sizeof(value_type); + return *this; + } + + const_fixed_iterator operator++(int) noexcept { + const const_fixed_iterator tmp{*this}; + ++(*this); + return tmp; + } + + const_fixed_iterator& operator--() noexcept { + m_data -= sizeof(value_type); + return *this; + } + + const_fixed_iterator operator--(int) noexcept { + const const_fixed_iterator tmp{*this}; + --(*this); + return tmp; + } + + friend bool operator==(const_fixed_iterator lhs, const_fixed_iterator rhs) noexcept { + return lhs.m_data == rhs.m_data; + } + + friend bool operator!=(const_fixed_iterator lhs, const_fixed_iterator rhs) noexcept { + 
return !(lhs == rhs); + } + + friend bool operator<(const_fixed_iterator lhs, const_fixed_iterator rhs) noexcept { + return lhs.m_data < rhs.m_data; + } + + friend bool operator>(const_fixed_iterator lhs, const_fixed_iterator rhs) noexcept { + return rhs < lhs; + } + + friend bool operator<=(const_fixed_iterator lhs, const_fixed_iterator rhs) noexcept { + return !(lhs > rhs); + } + + friend bool operator>=(const_fixed_iterator lhs, const_fixed_iterator rhs) noexcept { + return !(lhs < rhs); + } + + const_fixed_iterator& operator+=(difference_type val) noexcept { + m_data += (sizeof(value_type) * val); + return *this; + } + + friend const_fixed_iterator operator+(const_fixed_iterator lhs, difference_type rhs) noexcept { + const_fixed_iterator tmp{lhs}; + tmp.m_data += (sizeof(value_type) * rhs); + return tmp; + } + + friend const_fixed_iterator operator+(difference_type lhs, const_fixed_iterator rhs) noexcept { + const_fixed_iterator tmp{rhs}; + tmp.m_data += (sizeof(value_type) * lhs); + return tmp; + } + + const_fixed_iterator& operator-=(difference_type val) noexcept { + m_data -= (sizeof(value_type) * val); + return *this; + } + + friend const_fixed_iterator operator-(const_fixed_iterator lhs, difference_type rhs) noexcept { + const_fixed_iterator tmp{lhs}; + tmp.m_data -= (sizeof(value_type) * rhs); + return tmp; + } + + friend difference_type operator-(const_fixed_iterator lhs, const_fixed_iterator rhs) noexcept { + return static_cast(lhs.m_data - rhs.m_data) / static_cast(sizeof(T)); + } + + value_type operator[](difference_type n) const noexcept { + return *(*this + n); + } + + /// @endcond + +}; // class const_fixed_iterator + +/** + * A forward iterator used for accessing packed repeated varint fields + * (int32, uint32, int64, uint64, bool, enum). 
+ */ +template +class const_varint_iterator { + +protected: + + /// Pointer to current iterator position + const char* m_data = nullptr; // NOLINT(misc-non-private-member-variables-in-classes, cppcoreguidelines-non-private-member-variables-in-classes,-warnings-as-errors) + + /// Pointer to end iterator position + const char* m_end = nullptr; // NOLINT(misc-non-private-member-variables-in-classes, cppcoreguidelines-non-private-member-variables-in-classes,-warnings-as-errors) + +public: + + /// @cond usual iterator functions not documented + + using iterator_category = std::forward_iterator_tag; + using value_type = T; + using difference_type = std::ptrdiff_t; + using pointer = value_type*; + using reference = value_type&; + + static difference_type distance(const_varint_iterator begin, const_varint_iterator end) noexcept { + // The "distance" between default initialized const_varint_iterator's + // is always 0. + if (!begin.m_data) { + return 0; + } + // We know that each varint contains exactly one byte with the most + // significant bit not set. We can use this to quickly figure out + // how many varints there are without actually decoding the varints. 
+ return std::count_if(begin.m_data, end.m_data, [](char c) noexcept { + return (static_cast(c) & 0x80U) == 0; + }); + } + + const_varint_iterator() noexcept = default; + + const_varint_iterator(const char* data, const char* end) noexcept : + m_data{data}, + m_end{end} { + } + + const_varint_iterator(const const_varint_iterator&) noexcept = default; + const_varint_iterator(const_varint_iterator&&) noexcept = default; + + const_varint_iterator& operator=(const const_varint_iterator&) noexcept = default; + const_varint_iterator& operator=(const_varint_iterator&&) noexcept = default; + + ~const_varint_iterator() noexcept = default; + + value_type operator*() const { + protozero_assert(m_data); + const char* d = m_data; // will be thrown away + return static_cast(decode_varint(&d, m_end)); + } + + const_varint_iterator& operator++() { + protozero_assert(m_data); + skip_varint(&m_data, m_end); + return *this; + } + + const_varint_iterator operator++(int) { + protozero_assert(m_data); + const const_varint_iterator tmp{*this}; + ++(*this); + return tmp; + } + + bool operator==(const const_varint_iterator& rhs) const noexcept { + return m_data == rhs.m_data && m_end == rhs.m_end; + } + + bool operator!=(const const_varint_iterator& rhs) const noexcept { + return !(*this == rhs); + } + + /// @endcond + +}; // class const_varint_iterator + +/** + * A forward iterator used for accessing packed repeated svarint fields + * (sint32, sint64). 
+ */ +template +class const_svarint_iterator : public const_varint_iterator { + +public: + + /// @cond usual iterator functions not documented + + using iterator_category = std::forward_iterator_tag; + using value_type = T; + using difference_type = std::ptrdiff_t; + using pointer = value_type*; + using reference = value_type&; + + const_svarint_iterator() noexcept : + const_varint_iterator{} { + } + + const_svarint_iterator(const char* data, const char* end) noexcept : + const_varint_iterator{data, end} { + } + + const_svarint_iterator(const const_svarint_iterator&) = default; + const_svarint_iterator(const_svarint_iterator&&) noexcept = default; + + const_svarint_iterator& operator=(const const_svarint_iterator&) = default; + const_svarint_iterator& operator=(const_svarint_iterator&&) noexcept = default; + + ~const_svarint_iterator() = default; + + value_type operator*() const { + protozero_assert(this->m_data); + const char* d = this->m_data; // will be thrown away + return static_cast(decode_zigzag64(decode_varint(&d, this->m_end))); + } + + const_svarint_iterator& operator++() { + protozero_assert(this->m_data); + skip_varint(&this->m_data, this->m_end); + return *this; + } + + const_svarint_iterator operator++(int) { + protozero_assert(this->m_data); + const const_svarint_iterator tmp{*this}; + ++(*this); + return tmp; + } + + /// @endcond + +}; // class const_svarint_iterator + +} // end namespace protozero + +namespace std { + + // Specialize std::distance for all the protozero iterators. Because + // functions can't be partially specialized, we have to do this for + // every value_type we are using. 
+ + /// @cond individual overloads do not need to be documented + + template <> + inline typename protozero::const_varint_iterator::difference_type + distance>(protozero::const_varint_iterator first, // NOLINT(readability-inconsistent-declaration-parameter-name) + protozero::const_varint_iterator last) { + return protozero::const_varint_iterator::distance(first, last); + } + + template <> + inline typename protozero::const_varint_iterator::difference_type + distance>(protozero::const_varint_iterator first, // NOLINT(readability-inconsistent-declaration-parameter-name) + protozero::const_varint_iterator last) { + return protozero::const_varint_iterator::distance(first, last); + } + + template <> + inline typename protozero::const_varint_iterator::difference_type + distance>(protozero::const_varint_iterator first, // NOLINT(readability-inconsistent-declaration-parameter-name) + protozero::const_varint_iterator last) { + return protozero::const_varint_iterator::distance(first, last); + } + + template <> + inline typename protozero::const_varint_iterator::difference_type + distance>(protozero::const_varint_iterator first, // NOLINT(readability-inconsistent-declaration-parameter-name) + protozero::const_varint_iterator last) { + return protozero::const_varint_iterator::distance(first, last); + } + + template <> + inline typename protozero::const_svarint_iterator::difference_type + distance>(protozero::const_svarint_iterator first, // NOLINT(readability-inconsistent-declaration-parameter-name) + protozero::const_svarint_iterator last) { + return protozero::const_svarint_iterator::distance(first, last); + } + + template <> + inline typename protozero::const_svarint_iterator::difference_type + distance>(protozero::const_svarint_iterator first, // NOLINT(readability-inconsistent-declaration-parameter-name) + protozero::const_svarint_iterator last) { + return protozero::const_svarint_iterator::distance(first, last); + } + + /// @endcond + +} // end namespace std + +#endif // 
PROTOZERO_ITERATORS_HPP diff --git a/include/protozero/pbf_builder.hpp b/include/protozero/pbf_builder.hpp new file mode 100644 index 00000000..71a2dec2 --- /dev/null +++ b/include/protozero/pbf_builder.hpp @@ -0,0 +1,32 @@ +#ifndef PROTOZERO_PBF_BUILDER_HPP +#define PROTOZERO_PBF_BUILDER_HPP + +/***************************************************************************** + +protozero - Minimalistic protocol buffer decoder and encoder in C++. + +This file is from https://github.com/mapbox/protozero where you can find more +documentation. + +*****************************************************************************/ + +/** + * @file pbf_builder.hpp + * + * @brief Contains the pbf_builder template class. + */ + +#include "basic_pbf_builder.hpp" +#include "pbf_writer.hpp" + +#include + +namespace protozero { + +/// Specialization of basic_pbf_builder using std::string as buffer type. +template +using pbf_builder = basic_pbf_builder; + +} // end namespace protozero + +#endif // PROTOZERO_PBF_BUILDER_HPP diff --git a/include/protozero/pbf_message.hpp b/include/protozero/pbf_message.hpp new file mode 100644 index 00000000..d7fd8b5d --- /dev/null +++ b/include/protozero/pbf_message.hpp @@ -0,0 +1,184 @@ +#ifndef PROTOZERO_PBF_MESSAGE_HPP +#define PROTOZERO_PBF_MESSAGE_HPP + +/***************************************************************************** + +protozero - Minimalistic protocol buffer decoder and encoder in C++. + +This file is from https://github.com/mapbox/protozero where you can find more +documentation. + +*****************************************************************************/ + +/** + * @file pbf_message.hpp + * + * @brief Contains the pbf_message template class. + */ + +#include "pbf_reader.hpp" +#include "types.hpp" + +#include + +namespace protozero { + +/** + * This class represents a protobuf message. Either a top-level message or + * a nested sub-message. 
Top-level messages can be created from any buffer + * with a pointer and length: + * + * @code + * enum class Message : protozero::pbf_tag_type { + * ... + * }; + * + * std::string buffer; + * // fill buffer... + * pbf_message message{buffer.data(), buffer.size()}; + * @endcode + * + * Sub-messages are created using get_message(): + * + * @code + * enum class SubMessage : protozero::pbf_tag_type { + * ... + * }; + * + * pbf_message message{...}; + * message.next(); + * pbf_message submessage = message.get_message(); + * @endcode + * + * All methods of the pbf_message class except get_bytes() and get_string() + * provide the strong exception guarantee, ie they either succeed or do not + * change the pbf_message object they are called on. Use the get_data() method + * instead of get_bytes() or get_string(), if you need this guarantee. + * + * This template class is based on the pbf_reader class and has all the same + * methods. The difference is that wherever the pbf_reader class takes an + * integer tag, this template class takes a tag of the template type T. + * + * Read the tutorial to understand how this class is used. + */ +template +class pbf_message : public pbf_reader { + + static_assert(std::is_same::type>::value, + "T must be enum with underlying type protozero::pbf_tag_type"); + +public: + + /// The type of messages this class will read. + using enum_type = T; + + /** + * Construct a pbf_message. All arguments are forwarded to the pbf_reader + * parent class. + */ + template + pbf_message(Args&&... args) noexcept : // NOLINT(google-explicit-constructor, hicpp-explicit-conversions) + pbf_reader{std::forward(args)...} { + } + + /** + * Set next field in the message as the current field. This is usually + * called in a while loop: + * + * @code + * pbf_message<...> message(...); + * while (message.next()) { + * // handle field + * } + * @endcode + * + * @returns `true` if there is a next field, `false` if not. + * @pre There must be no current field.
+ * @post If it returns `true` there is a current field now. + */ + bool next() { + return pbf_reader::next(); + } + + /** + * Set next field with given tag in the message as the current field. + * Fields with other tags are skipped. This is usually called in a while + * loop for repeated fields: + * + * @code + * pbf_message message{...}; + * while (message.next(Example1::repeated_fixed64_r)) { + * // handle field + * } + * @endcode + * + * or you can call it just once to get the one field with this tag: + * + * @code + * pbf_message message{...}; + * if (message.next(Example1::required_uint32_x)) { + * // handle field + * } + * @endcode + * + * Note that this will not check the wire type. The two-argument version + * of this function will also check the wire type. + * + * @returns `true` if there is a next field with this tag. + * @pre There must be no current field. + * @post If it returns `true` there is a current field now with the given tag. + */ + bool next(T next_tag) { + return pbf_reader::next(pbf_tag_type(next_tag)); + } + + /** + * Set next field with given tag and wire type in the message as the + * current field. Fields with other tags are skipped. This is usually + * called in a while loop for repeated fields: + * + * @code + * pbf_message message{...}; + * while (message.next(Example1::repeated_fixed64_r, pbf_wire_type::varint)) { + * // handle field + * } + * @endcode + * + * or you can call it just once to get the one field with this tag: + * + * @code + * pbf_message message{...}; + * if (message.next(Example1::required_uint32_x, pbf_wire_type::varint)) { + * // handle field + * } + * @endcode + * + * Note that this will also check the wire type. The one-argument version + * of this function will not check the wire type. + * + * @returns `true` if there is a next field with this tag. + * @pre There must be no current field. + * @post If it returns `true` there is a current field now with the given tag. 
+ */ + bool next(T next_tag, pbf_wire_type type) { + return pbf_reader::next(pbf_tag_type(next_tag), type); + } + + /** + * The tag of the current field. The tag is the enum value for the field + * number from the description in the .proto file. + * + * Call next() before calling this function to set the current field. + * + * @returns tag of the current field. + * @pre There must be a current field (ie. next() must have returned `true`). + */ + T tag() const noexcept { + return T(pbf_reader::tag()); + } + +}; // class pbf_message + +} // end namespace protozero + +#endif // PROTOZERO_PBF_MESSAGE_HPP diff --git a/include/protozero/pbf_reader.hpp b/include/protozero/pbf_reader.hpp new file mode 100644 index 00000000..92bfdee5 --- /dev/null +++ b/include/protozero/pbf_reader.hpp @@ -0,0 +1,977 @@ +#ifndef PROTOZERO_PBF_READER_HPP +#define PROTOZERO_PBF_READER_HPP + +/***************************************************************************** + +protozero - Minimalistic protocol buffer decoder and encoder in C++. + +This file is from https://github.com/mapbox/protozero where you can find more +documentation. + +*****************************************************************************/ + +/** + * @file pbf_reader.hpp + * + * @brief Contains the pbf_reader class. + */ + +#include "config.hpp" +#include "data_view.hpp" +#include "exception.hpp" +#include "iterators.hpp" +#include "types.hpp" +#include "varint.hpp" + +#if PROTOZERO_BYTE_ORDER != PROTOZERO_LITTLE_ENDIAN +# include +#endif + +#include +#include +#include +#include +#include + +namespace protozero { + +/** + * This class represents a protobuf message. Either a top-level message or + * a nested sub-message. Top-level messages can be created from any buffer + * with a pointer and length: + * + * @code + * std::string buffer; + * // fill buffer... 
+ * pbf_reader message{buffer.data(), buffer.size()}; + * @endcode + * + * Sub-messages are created using get_message(): + * + * @code + * pbf_reader message{...}; + * message.next(); + * pbf_reader submessage = message.get_message(); + * @endcode + * + * All methods of the pbf_reader class except get_bytes() and get_string() + * provide the strong exception guarantee, ie they either succeed or do not + * change the pbf_reader object they are called on. Use the get_view() method + * instead of get_bytes() or get_string(), if you need this guarantee. + */ +class pbf_reader { + + // A pointer to the next unread data. + const char* m_data = nullptr; + + // A pointer to one past the end of data. + const char* m_end = nullptr; + + // The wire type of the current field. + pbf_wire_type m_wire_type = pbf_wire_type::unknown; + + // The tag of the current field. + pbf_tag_type m_tag = 0; + + template + T get_fixed() { + T result; + const char* data = m_data; + skip_bytes(sizeof(T)); + std::memcpy(&result, data, sizeof(T)); +#if PROTOZERO_BYTE_ORDER != PROTOZERO_LITTLE_ENDIAN + byteswap_inplace(&result); +#endif + return result; + } + + template + iterator_range> packed_fixed() { + protozero_assert(tag() != 0 && "call next() before accessing field value"); + const auto len = get_len_and_skip(); + if (len % sizeof(T) != 0) { + throw invalid_length_exception{}; + } + return {const_fixed_iterator(m_data - len), + const_fixed_iterator(m_data)}; + } + + template + T get_varint() { + const auto val = static_cast(decode_varint(&m_data, m_end)); + return val; + } + + template + T get_svarint() { + protozero_assert((has_wire_type(pbf_wire_type::varint) || has_wire_type(pbf_wire_type::length_delimited)) && "not a varint"); + return static_cast(decode_zigzag64(decode_varint(&m_data, m_end))); + } + + pbf_length_type get_length() { + return get_varint(); + } + + void skip_bytes(pbf_length_type len) { + if (m_end - m_data < static_cast(len)) { + throw end_of_buffer_exception{}; + } + 
m_data += len; + +#ifndef NDEBUG + // In debug builds reset the tag to zero so that we can detect (some) + // wrong code. + m_tag = 0; +#endif + } + + pbf_length_type get_len_and_skip() { + const auto len = get_length(); + skip_bytes(len); + return len; + } + + template + iterator_range get_packed() { + protozero_assert(tag() != 0 && "call next() before accessing field value"); + const auto len = get_len_and_skip(); + return {T{m_data - len, m_data}, + T{m_data, m_data}}; + } + +public: + + /** + * Construct a pbf_reader message from a data_view. The pointer from the + * data_view will be stored inside the pbf_reader object, no data is + * copied. So you must make sure the view stays valid as long as the + * pbf_reader object is used. + * + * The buffer must contain a complete protobuf message. + * + * @post There is no current field. + */ + explicit pbf_reader(const data_view& view) noexcept + : m_data{view.data()}, + m_end{view.data() + view.size()} { + } + + /** + * Construct a pbf_reader message from a data pointer and a length. The + * pointer will be stored inside the pbf_reader object, no data is copied. + * So you must make sure the buffer stays valid as long as the pbf_reader + * object is used. + * + * The buffer must contain a complete protobuf message. + * + * @post There is no current field. + */ + pbf_reader(const char* data, std::size_t size) noexcept + : m_data{data}, + m_end{data + size} { + } + +#ifndef PROTOZERO_STRICT_API + /** + * Construct a pbf_reader message from a data pointer and a length. The + * pointer will be stored inside the pbf_reader object, no data is copied. + * So you must make sure the buffer stays valid as long as the pbf_reader + * object is used. + * + * The buffer must contain a complete protobuf message. + * + * @post There is no current field. + * @deprecated Use one of the other constructors. 
+ */ + explicit pbf_reader(const std::pair& data) noexcept + : m_data{data.first}, + m_end{data.first + data.second} { + } +#endif + + /** + * Construct a pbf_reader message from a std::string. A pointer to the + * string internals will be stored inside the pbf_reader object, no data + * is copied. So you must make sure the string is unchanged as long as the + * pbf_reader object is used. + * + * The string must contain a complete protobuf message. + * + * @post There is no current field. + */ + explicit pbf_reader(const std::string& data) noexcept + : m_data{data.data()}, + m_end{data.data() + data.size()} { + } + + /** + * pbf_reader can be default constructed and behaves like it has an empty + * buffer. + */ + pbf_reader() noexcept = default; + + /// pbf_reader messages can be copied trivially. + pbf_reader(const pbf_reader&) noexcept = default; + + /// pbf_reader messages can be moved trivially. + pbf_reader(pbf_reader&&) noexcept = default; + + /// pbf_reader messages can be copied trivially. + pbf_reader& operator=(const pbf_reader& other) noexcept = default; + + /// pbf_reader messages can be moved trivially. + pbf_reader& operator=(pbf_reader&& other) noexcept = default; + + ~pbf_reader() = default; + + /** + * Swap the contents of this object with the other. + * + * @param other Other object to swap data with. + */ + void swap(pbf_reader& other) noexcept { + using std::swap; + swap(m_data, other.m_data); + swap(m_end, other.m_end); + swap(m_wire_type, other.m_wire_type); + swap(m_tag, other.m_tag); + } + + /** + * In a boolean context the pbf_reader class evaluates to `true` if there + * are still fields available and to `false` if the last field has been + * read. + */ + operator bool() const noexcept { // NOLINT(google-explicit-constructor, hicpp-explicit-conversions) + return m_data != m_end; + } + + /** + * Get a view of the not yet read data. 
+ */ + data_view data() const noexcept { + return {m_data, static_cast(m_end - m_data)}; + } + + /** + * Return the length in bytes of the current message. If you have + * already called next() and/or any of the get_*() functions, this will + * return the remaining length. + * + * This can, for instance, be used to estimate the space needed for a + * buffer. Of course you have to know reasonably well what data to expect + * and how it is encoded for this number to have any meaning. + */ + std::size_t length() const noexcept { + return std::size_t(m_end - m_data); + } + + /** + * Set next field in the message as the current field. This is usually + * called in a while loop: + * + * @code + * pbf_reader message(...); + * while (message.next()) { + * // handle field + * } + * @endcode + * + * @returns `true` if there is a next field, `false` if not. + * @pre There must be no current field. + * @post If it returns `true` there is a current field now. + */ + bool next() { + if (m_data == m_end) { + return false; + } + + const auto value = get_varint(); + m_tag = pbf_tag_type(value >> 3U); + + // tags 0 and 19000 to 19999 are not allowed as per + // https://developers.google.com/protocol-buffers/docs/proto#assigning-tags + if (m_tag == 0 || (m_tag >= 19000 && m_tag <= 19999)) { + throw invalid_tag_exception{}; + } + + m_wire_type = pbf_wire_type(value & 0x07U); + switch (m_wire_type) { + case pbf_wire_type::varint: + case pbf_wire_type::fixed64: + case pbf_wire_type::length_delimited: + case pbf_wire_type::fixed32: + break; + default: + throw unknown_pbf_wire_type_exception{}; + } + + return true; + } + + /** + * Set next field with given tag in the message as the current field. + * Fields with other tags are skipped. 
This is usually called in a while + * loop for repeated fields: + * + * @code + * pbf_reader message{...}; + * while (message.next(17)) { + * // handle field + * } + * @endcode + * + * or you can call it just once to get the one field with this tag: + * + * @code + * pbf_reader message{...}; + * if (message.next(17)) { + * // handle field + * } + * @endcode + * + * Note that this will not check the wire type. The two-argument version + * of this function will also check the wire type. + * + * @returns `true` if there is a next field with this tag. + * @pre There must be no current field. + * @post If it returns `true` there is a current field now with the given tag. + */ + bool next(pbf_tag_type next_tag) { + while (next()) { + if (m_tag == next_tag) { + return true; + } + skip(); + } + return false; + } + + /** + * Set next field with given tag and wire type in the message as the + * current field. Fields with other tags are skipped. This is usually + * called in a while loop for repeated fields: + * + * @code + * pbf_reader message{...}; + * while (message.next(17, pbf_wire_type::varint)) { + * // handle field + * } + * @endcode + * + * or you can call it just once to get the one field with this tag: + * + * @code + * pbf_reader message{...}; + * if (message.next(17, pbf_wire_type::varint)) { + * // handle field + * } + * @endcode + * + * Note that this will also check the wire type. The one-argument version + * of this function will not check the wire type. + * + * @returns `true` if there is a next field with this tag. + * @pre There must be no current field. + * @post If it returns `true` there is a current field now with the given tag. + */ + bool next(pbf_tag_type next_tag, pbf_wire_type type) { + while (next()) { + if (m_tag == next_tag && m_wire_type == type) { + return true; + } + skip(); + } + return false; + } + + /** + * The tag of the current field. The tag is the field number from the + * description in the .proto file. 
+ * + * Call next() before calling this function to set the current field. + * + * @returns tag of the current field. + * @pre There must be a current field (ie. next() must have returned `true`). + */ + pbf_tag_type tag() const noexcept { + return m_tag; + } + + /** + * Get the wire type of the current field. The wire types are: + * + * * 0 - varint + * * 1 - 64 bit + * * 2 - length-delimited + * * 5 - 32 bit + * + * All other types are illegal. + * + * Call next() before calling this function to set the current field. + * + * @returns wire type of the current field. + * @pre There must be a current field (ie. next() must have returned `true`). + */ + pbf_wire_type wire_type() const noexcept { + return m_wire_type; + } + + /** + * Get the tag and wire type of the current field in one integer suitable + * for comparison with a switch statement. + * + * Use it like this: + * + * @code + * pbf_reader message{...}; + * while (message.next()) { + * switch (message.tag_and_type()) { + * case tag_and_type(17, pbf_wire_type::length_delimited): + * .... + * break; + * case tag_and_type(21, pbf_wire_type::varint): + * .... + * break; + * default: + * message.skip(); + * } + * } + * @endcode + */ + uint32_t tag_and_type() const noexcept { + return protozero::tag_and_type(tag(), wire_type()); + } + + /** + * Check the wire type of the current field. + * + * @returns `true` if the current field has the given wire type. + * @pre There must be a current field (ie. next() must have returned `true`). + */ + bool has_wire_type(pbf_wire_type type) const noexcept { + return wire_type() == type; + } + + /** + * Consume the current field. + * + * @pre There must be a current field (ie. next() must have returned `true`). + * @post The current field was consumed and there is no current field now. 
+ */ + void skip() { + protozero_assert(tag() != 0 && "call next() before calling skip()"); + switch (wire_type()) { + case pbf_wire_type::varint: + skip_varint(&m_data, m_end); + break; + case pbf_wire_type::fixed64: + skip_bytes(8); + break; + case pbf_wire_type::length_delimited: + skip_bytes(get_length()); + break; + case pbf_wire_type::fixed32: + skip_bytes(4); + break; + default: + break; + } + } + + ///@{ + /** + * @name Scalar field accessor functions + */ + + /** + * Consume and return value of current "bool" field. + * + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "bool". + * @post The current field was consumed and there is no current field now. + */ + bool get_bool() { + protozero_assert(tag() != 0 && "call next() before accessing field value"); + protozero_assert(has_wire_type(pbf_wire_type::varint) && "not a varint"); + const bool result = m_data[0] != 0; + skip_varint(&m_data, m_end); + return result; + } + + /** + * Consume and return value of current "enum" field. + * + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "enum". + * @post The current field was consumed and there is no current field now. + */ + int32_t get_enum() { + protozero_assert(has_wire_type(pbf_wire_type::varint) && "not a varint"); + return get_varint(); + } + + /** + * Consume and return value of current "int32" varint field. + * + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "int32". + * @post The current field was consumed and there is no current field now. + */ + int32_t get_int32() { + protozero_assert(has_wire_type(pbf_wire_type::varint) && "not a varint"); + return get_varint(); + } + + /** + * Consume and return value of current "sint32" varint field. + * + * @pre There must be a current field (ie. next() must have returned `true`). 
+ * @pre The current field must be of type "sint32". + * @post The current field was consumed and there is no current field now. + */ + int32_t get_sint32() { + protozero_assert(has_wire_type(pbf_wire_type::varint) && "not a varint"); + return get_svarint(); + } + + /** + * Consume and return value of current "uint32" varint field. + * + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "uint32". + * @post The current field was consumed and there is no current field now. + */ + uint32_t get_uint32() { + protozero_assert(has_wire_type(pbf_wire_type::varint) && "not a varint"); + return get_varint(); + } + + /** + * Consume and return value of current "int64" varint field. + * + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "int64". + * @post The current field was consumed and there is no current field now. + */ + int64_t get_int64() { + protozero_assert(has_wire_type(pbf_wire_type::varint) && "not a varint"); + return get_varint(); + } + + /** + * Consume and return value of current "sint64" varint field. + * + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "sint64". + * @post The current field was consumed and there is no current field now. + */ + int64_t get_sint64() { + protozero_assert(has_wire_type(pbf_wire_type::varint) && "not a varint"); + return get_svarint(); + } + + /** + * Consume and return value of current "uint64" varint field. + * + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "uint64". + * @post The current field was consumed and there is no current field now. + */ + uint64_t get_uint64() { + protozero_assert(has_wire_type(pbf_wire_type::varint) && "not a varint"); + return get_varint(); + } + + /** + * Consume and return value of current "fixed32" field. 
+ * + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "fixed32". + * @post The current field was consumed and there is no current field now. + */ + uint32_t get_fixed32() { + protozero_assert(tag() != 0 && "call next() before accessing field value"); + protozero_assert(has_wire_type(pbf_wire_type::fixed32) && "not a 32-bit fixed"); + return get_fixed(); + } + + /** + * Consume and return value of current "sfixed32" field. + * + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "sfixed32". + * @post The current field was consumed and there is no current field now. + */ + int32_t get_sfixed32() { + protozero_assert(tag() != 0 && "call next() before accessing field value"); + protozero_assert(has_wire_type(pbf_wire_type::fixed32) && "not a 32-bit fixed"); + return get_fixed(); + } + + /** + * Consume and return value of current "fixed64" field. + * + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "fixed64". + * @post The current field was consumed and there is no current field now. + */ + uint64_t get_fixed64() { + protozero_assert(tag() != 0 && "call next() before accessing field value"); + protozero_assert(has_wire_type(pbf_wire_type::fixed64) && "not a 64-bit fixed"); + return get_fixed(); + } + + /** + * Consume and return value of current "sfixed64" field. + * + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "sfixed64". + * @post The current field was consumed and there is no current field now. + */ + int64_t get_sfixed64() { + protozero_assert(tag() != 0 && "call next() before accessing field value"); + protozero_assert(has_wire_type(pbf_wire_type::fixed64) && "not a 64-bit fixed"); + return get_fixed(); + } + + /** + * Consume and return value of current "float" field. 
+ * + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "float". + * @post The current field was consumed and there is no current field now. + */ + float get_float() { + protozero_assert(tag() != 0 && "call next() before accessing field value"); + protozero_assert(has_wire_type(pbf_wire_type::fixed32) && "not a 32-bit fixed"); + return get_fixed(); + } + + /** + * Consume and return value of current "double" field. + * + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "double". + * @post The current field was consumed and there is no current field now. + */ + double get_double() { + protozero_assert(tag() != 0 && "call next() before accessing field value"); + protozero_assert(has_wire_type(pbf_wire_type::fixed64) && "not a 64-bit fixed"); + return get_fixed(); + } + + /** + * Consume and return value of current "bytes", "string", or "message" + * field. + * + * @returns A data_view object. + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "bytes", "string", or "message". + * @post The current field was consumed and there is no current field now. + */ + data_view get_view() { + protozero_assert(tag() != 0 && "call next() before accessing field value"); + protozero_assert(has_wire_type(pbf_wire_type::length_delimited) && "not of type string, bytes or message"); + const auto len = get_len_and_skip(); + return {m_data - len, len}; + } + +#ifndef PROTOZERO_STRICT_API + /** + * Consume and return value of current "bytes" or "string" field. + * + * @returns A pair with a pointer to the data and the length of the data. + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "bytes" or "string". + * @post The current field was consumed and there is no current field now. 
+ */ + std::pair get_data() { + protozero_assert(tag() != 0 && "call next() before accessing field value"); + protozero_assert(has_wire_type(pbf_wire_type::length_delimited) && "not of type string, bytes or message"); + const auto len = get_len_and_skip(); + return {m_data - len, len}; + } +#endif + + /** + * Consume and return value of current "bytes" field. + * + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "bytes". + * @post The current field was consumed and there is no current field now. + */ + std::string get_bytes() { + return std::string(get_view()); + } + + /** + * Consume and return value of current "string" field. + * + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "string". + * @post The current field was consumed and there is no current field now. + */ + std::string get_string() { + return std::string(get_view()); + } + + /** + * Consume and return value of current "message" field. + * + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "message". + * @post The current field was consumed and there is no current field now. + */ + pbf_reader get_message() { + return pbf_reader{get_view()}; + } + + ///@} + + /// Forward iterator for iterating over bool (int32 varint) values. + using const_bool_iterator = const_varint_iterator< int32_t>; + + /// Forward iterator for iterating over enum (int32 varint) values. + using const_enum_iterator = const_varint_iterator< int32_t>; + + /// Forward iterator for iterating over int32 (varint) values. + using const_int32_iterator = const_varint_iterator< int32_t>; + + /// Forward iterator for iterating over sint32 (varint) values. + using const_sint32_iterator = const_svarint_iterator; + + /// Forward iterator for iterating over uint32 (varint) values. 
+ using const_uint32_iterator = const_varint_iterator; + + /// Forward iterator for iterating over int64 (varint) values. + using const_int64_iterator = const_varint_iterator< int64_t>; + + /// Forward iterator for iterating over sint64 (varint) values. + using const_sint64_iterator = const_svarint_iterator; + + /// Forward iterator for iterating over uint64 (varint) values. + using const_uint64_iterator = const_varint_iterator; + + /// Forward iterator for iterating over fixed32 values. + using const_fixed32_iterator = const_fixed_iterator; + + /// Forward iterator for iterating over sfixed32 values. + using const_sfixed32_iterator = const_fixed_iterator; + + /// Forward iterator for iterating over fixed64 values. + using const_fixed64_iterator = const_fixed_iterator; + + /// Forward iterator for iterating over sfixed64 values. + using const_sfixed64_iterator = const_fixed_iterator; + + /// Forward iterator for iterating over float values. + using const_float_iterator = const_fixed_iterator; + + /// Forward iterator for iterating over double values. + using const_double_iterator = const_fixed_iterator; + + ///@{ + /** + * @name Repeated packed field accessor functions + */ + + /** + * Consume current "repeated packed bool" field. + * + * @returns a pair of iterators to the beginning and one past the end of + * the data. + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "repeated packed bool". + * @post The current field was consumed and there is no current field now. + */ + iterator_range get_packed_bool() { + return get_packed(); + } + + /** + * Consume current "repeated packed enum" field. + * + * @returns a pair of iterators to the beginning and one past the end of + * the data. + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "repeated packed enum". 
+ * @post The current field was consumed and there is no current field now. + */ + iterator_range get_packed_enum() { + return get_packed(); + } + + /** + * Consume current "repeated packed int32" field. + * + * @returns a pair of iterators to the beginning and one past the end of + * the data. + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "repeated packed int32". + * @post The current field was consumed and there is no current field now. + */ + iterator_range get_packed_int32() { + return get_packed(); + } + + /** + * Consume current "repeated packed sint32" field. + * + * @returns a pair of iterators to the beginning and one past the end of + * the data. + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "repeated packed sint32". + * @post The current field was consumed and there is no current field now. + */ + iterator_range get_packed_sint32() { + return get_packed(); + } + + /** + * Consume current "repeated packed uint32" field. + * + * @returns a pair of iterators to the beginning and one past the end of + * the data. + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "repeated packed uint32". + * @post The current field was consumed and there is no current field now. + */ + iterator_range get_packed_uint32() { + return get_packed(); + } + + /** + * Consume current "repeated packed int64" field. + * + * @returns a pair of iterators to the beginning and one past the end of + * the data. + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "repeated packed int64". + * @post The current field was consumed and there is no current field now. + */ + iterator_range get_packed_int64() { + return get_packed(); + } + + /** + * Consume current "repeated packed sint64" field. 
+ * + * @returns a pair of iterators to the beginning and one past the end of + * the data. + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "repeated packed sint64". + * @post The current field was consumed and there is no current field now. + */ + iterator_range get_packed_sint64() { + return get_packed(); + } + + /** + * Consume current "repeated packed uint64" field. + * + * @returns a pair of iterators to the beginning and one past the end of + * the data. + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "repeated packed uint64". + * @post The current field was consumed and there is no current field now. + */ + iterator_range get_packed_uint64() { + return get_packed(); + } + + /** + * Consume current "repeated packed fixed32" field. + * + * @returns a pair of iterators to the beginning and one past the end of + * the data. + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "repeated packed fixed32". + * @post The current field was consumed and there is no current field now. + */ + iterator_range get_packed_fixed32() { + return packed_fixed(); + } + + /** + * Consume current "repeated packed sfixed32" field. + * + * @returns a pair of iterators to the beginning and one past the end of + * the data. + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "repeated packed sfixed32". + * @post The current field was consumed and there is no current field now. + */ + iterator_range get_packed_sfixed32() { + return packed_fixed(); + } + + /** + * Consume current "repeated packed fixed64" field. + * + * @returns a pair of iterators to the beginning and one past the end of + * the data. + * @pre There must be a current field (ie. next() must have returned `true`). 
+ * @pre The current field must be of type "repeated packed fixed64". + * @post The current field was consumed and there is no current field now. + */ + iterator_range get_packed_fixed64() { + return packed_fixed(); + } + + /** + * Consume current "repeated packed sfixed64" field. + * + * @returns a pair of iterators to the beginning and one past the end of + * the data. + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "repeated packed sfixed64". + * @post The current field was consumed and there is no current field now. + */ + iterator_range get_packed_sfixed64() { + return packed_fixed(); + } + + /** + * Consume current "repeated packed float" field. + * + * @returns a pair of iterators to the beginning and one past the end of + * the data. + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "repeated packed float". + * @post The current field was consumed and there is no current field now. + */ + iterator_range get_packed_float() { + return packed_fixed(); + } + + /** + * Consume current "repeated packed double" field. + * + * @returns a pair of iterators to the beginning and one past the end of + * the data. + * @pre There must be a current field (ie. next() must have returned `true`). + * @pre The current field must be of type "repeated packed double". + * @post The current field was consumed and there is no current field now. + */ + iterator_range get_packed_double() { + return packed_fixed(); + } + + ///@} + +}; // class pbf_reader + +/** + * Swap two pbf_reader objects. + * + * @param lhs First object. + * @param rhs Second object. 
+ */ +inline void swap(pbf_reader& lhs, pbf_reader& rhs) noexcept { + lhs.swap(rhs); +} + +} // end namespace protozero + +#endif // PROTOZERO_PBF_READER_HPP diff --git a/include/protozero/pbf_writer.hpp b/include/protozero/pbf_writer.hpp new file mode 100644 index 00000000..9a07bd5b --- /dev/null +++ b/include/protozero/pbf_writer.hpp @@ -0,0 +1,76 @@ +#ifndef PROTOZERO_PBF_WRITER_HPP +#define PROTOZERO_PBF_WRITER_HPP + +/***************************************************************************** + +protozero - Minimalistic protocol buffer decoder and encoder in C++. + +This file is from https://github.com/mapbox/protozero where you can find more +documentation. + +*****************************************************************************/ + +/** + * @file pbf_writer.hpp + * + * @brief Contains the pbf_writer class. + */ + +#include "basic_pbf_writer.hpp" +#include "buffer_string.hpp" + +#include +#include + +namespace protozero { + +/** + * Specialization of basic_pbf_writer using std::string as buffer type. + */ +using pbf_writer = basic_pbf_writer; + +/// Class for generating packed repeated bool fields. +using packed_field_bool = detail::packed_field_varint; + +/// Class for generating packed repeated enum fields. +using packed_field_enum = detail::packed_field_varint; + +/// Class for generating packed repeated int32 fields. +using packed_field_int32 = detail::packed_field_varint; + +/// Class for generating packed repeated sint32 fields. +using packed_field_sint32 = detail::packed_field_svarint; + +/// Class for generating packed repeated uint32 fields. +using packed_field_uint32 = detail::packed_field_varint; + +/// Class for generating packed repeated int64 fields. +using packed_field_int64 = detail::packed_field_varint; + +/// Class for generating packed repeated sint64 fields. +using packed_field_sint64 = detail::packed_field_svarint; + +/// Class for generating packed repeated uint64 fields. 
+using packed_field_uint64 = detail::packed_field_varint; + +/// Class for generating packed repeated fixed32 fields. +using packed_field_fixed32 = detail::packed_field_fixed; + +/// Class for generating packed repeated sfixed32 fields. +using packed_field_sfixed32 = detail::packed_field_fixed; + +/// Class for generating packed repeated fixed64 fields. +using packed_field_fixed64 = detail::packed_field_fixed; + +/// Class for generating packed repeated sfixed64 fields. +using packed_field_sfixed64 = detail::packed_field_fixed; + +/// Class for generating packed repeated float fields. +using packed_field_float = detail::packed_field_fixed; + +/// Class for generating packed repeated double fields. +using packed_field_double = detail::packed_field_fixed; + +} // end namespace protozero + +#endif // PROTOZERO_PBF_WRITER_HPP diff --git a/include/protozero/types.hpp b/include/protozero/types.hpp new file mode 100644 index 00000000..3aefddfb --- /dev/null +++ b/include/protozero/types.hpp @@ -0,0 +1,66 @@ +#ifndef PROTOZERO_TYPES_HPP +#define PROTOZERO_TYPES_HPP + +/***************************************************************************** + +protozero - Minimalistic protocol buffer decoder and encoder in C++. + +This file is from https://github.com/mapbox/protozero where you can find more +documentation. + +*****************************************************************************/ + +/** + * @file types.hpp + * + * @brief Contains the declaration of low-level types used in the pbf format. + */ + +#include "config.hpp" + +#include +#include +#include +#include +#include +#include + +namespace protozero { + +/** + * The type used for field tags (field numbers). + */ +using pbf_tag_type = uint32_t; + +/** + * The type used to encode type information. 
+ * See the table on + * https://developers.google.com/protocol-buffers/docs/encoding + */ +enum class pbf_wire_type : uint32_t { + varint = 0, // int32/64, uint32/64, sint32/64, bool, enum + fixed64 = 1, // fixed64, sfixed64, double + length_delimited = 2, // string, bytes, nested messages, packed repeated fields + fixed32 = 5, // fixed32, sfixed32, float + unknown = 99 // used for default setting in this library +}; + +/** + * Get the tag and wire type of the current field in one integer suitable + * for comparison with a switch statement. + * + * See pbf_reader.tag_and_type() for an example how to use this. + */ +template +constexpr inline uint32_t tag_and_type(T tag, pbf_wire_type wire_type) noexcept { + return (static_cast(static_cast(tag)) << 3U) | static_cast(wire_type); +} + +/** + * The type used for length values, such as the length of a field. + */ +using pbf_length_type = uint32_t; + +} // end namespace protozero + +#endif // PROTOZERO_TYPES_HPP diff --git a/include/protozero/varint.hpp b/include/protozero/varint.hpp new file mode 100644 index 00000000..b4648a44 --- /dev/null +++ b/include/protozero/varint.hpp @@ -0,0 +1,245 @@ +#ifndef PROTOZERO_VARINT_HPP +#define PROTOZERO_VARINT_HPP + +/***************************************************************************** + +protozero - Minimalistic protocol buffer decoder and encoder in C++. + +This file is from https://github.com/mapbox/protozero where you can find more +documentation. + +*****************************************************************************/ + +/** + * @file varint.hpp + * + * @brief Contains low-level varint and zigzag encoding and decoding functions. + */ + +#include "buffer_tmpl.hpp" +#include "exception.hpp" + +#include + +namespace protozero { + +/** + * The maximum length of a 64 bit varint. 
+ */ +constexpr const int8_t max_varint_length = sizeof(uint64_t) * 8 / 7 + 1; + +namespace detail { + + // from https://github.com/facebook/folly/blob/master/folly/Varint.h + inline uint64_t decode_varint_impl(const char** data, const char* end) { + const auto* begin = reinterpret_cast(*data); + const auto* iend = reinterpret_cast(end); + const int8_t* p = begin; + uint64_t val = 0; + + if (iend - begin >= max_varint_length) { // fast path + do { + int64_t b = *p++; + val = ((uint64_t(b) & 0x7fU) ); if (b >= 0) { break; } + b = *p++; val |= ((uint64_t(b) & 0x7fU) << 7U); if (b >= 0) { break; } + b = *p++; val |= ((uint64_t(b) & 0x7fU) << 14U); if (b >= 0) { break; } + b = *p++; val |= ((uint64_t(b) & 0x7fU) << 21U); if (b >= 0) { break; } + b = *p++; val |= ((uint64_t(b) & 0x7fU) << 28U); if (b >= 0) { break; } + b = *p++; val |= ((uint64_t(b) & 0x7fU) << 35U); if (b >= 0) { break; } + b = *p++; val |= ((uint64_t(b) & 0x7fU) << 42U); if (b >= 0) { break; } + b = *p++; val |= ((uint64_t(b) & 0x7fU) << 49U); if (b >= 0) { break; } + b = *p++; val |= ((uint64_t(b) & 0x7fU) << 56U); if (b >= 0) { break; } + b = *p++; val |= ((uint64_t(b) & 0x01U) << 63U); if (b >= 0) { break; } + throw varint_too_long_exception{}; + } while (false); + } else { + unsigned int shift = 0; + while (p != iend && *p < 0) { + val |= (uint64_t(*p++) & 0x7fU) << shift; + shift += 7; + } + if (p == iend) { + throw end_of_buffer_exception{}; + } + val |= uint64_t(*p++) << shift; + } + + *data = reinterpret_cast(p); + return val; + } + +} // end namespace detail + +/** + * Decode a 64 bit varint. + * + * Strong exception guarantee: if there is an exception the data pointer will + * not be changed. + * + * @param[in,out] data Pointer to pointer to the input data. After the function + * returns this will point to the next data to be read. + * @param[in] end Pointer one past the end of the input data. 
+ * @returns The decoded integer + * @throws varint_too_long_exception if the varint is longer then the maximum + * length that would fit in a 64 bit int. Usually this means your data + * is corrupted or you are trying to read something as a varint that + * isn't. + * @throws end_of_buffer_exception if the *end* of the buffer was reached + * before the end of the varint. + */ +inline uint64_t decode_varint(const char** data, const char* end) { + // If this is a one-byte varint, decode it here. + if (end != *data && ((static_cast(**data) & 0x80U) == 0)) { + const auto val = static_cast(**data); + ++(*data); + return val; + } + // If this varint is more than one byte, defer to complete implementation. + return detail::decode_varint_impl(data, end); +} + +/** + * Skip over a varint. + * + * Strong exception guarantee: if there is an exception the data pointer will + * not be changed. + * + * @param[in,out] data Pointer to pointer to the input data. After the function + * returns this will point to the next data to be read. + * @param[in] end Pointer one past the end of the input data. + * @throws end_of_buffer_exception if the *end* of the buffer was reached + * before the end of the varint. + */ +inline void skip_varint(const char** data, const char* end) { + const auto* begin = reinterpret_cast(*data); + const auto* iend = reinterpret_cast(end); + const int8_t* p = begin; + + while (p != iend && *p < 0) { + ++p; + } + + if (p - begin >= max_varint_length) { + throw varint_too_long_exception{}; + } + + if (p == iend) { + throw end_of_buffer_exception{}; + } + + ++p; + + *data = reinterpret_cast(p); +} + +/** + * Varint encode a 64 bit integer. + * + * @tparam T An output iterator type. + * @param data Output iterator the varint encoded value will be written to + * byte by byte. + * @param value The integer that will be encoded. + * @returns the number of bytes written + * @throws Any exception thrown by increment or dereference operator on data. 
+ * @deprecated Use add_varint_to_buffer() instead. + */ +template +inline int write_varint(T data, uint64_t value) { + int n = 1; + + while (value >= 0x80U) { + *data++ = char((value & 0x7fU) | 0x80U); + value >>= 7U; + ++n; + } + *data = char(value); + + return n; +} + +/** + * Varint encode a 64 bit integer. + * + * @tparam TBuffer A buffer type. + * @param buffer Output buffer the varint will be written to. + * @param value The integer that will be encoded. + * @returns the number of bytes written + * @throws Any exception thrown by calling the buffer_push_back() function. + */ +template +inline void add_varint_to_buffer(TBuffer* buffer, uint64_t value) { + while (value >= 0x80U) { + buffer_customization::push_back(buffer, char((value & 0x7fU) | 0x80U)); + value >>= 7U; + } + buffer_customization::push_back(buffer, char(value)); +} + +/** + * Varint encode a 64 bit integer. + * + * @param data Where to add the varint. There must be enough space available! + * @param value The integer that will be encoded. + * @returns the number of bytes written + */ +inline int add_varint_to_buffer(char* data, uint64_t value) noexcept { + int n = 1; + + while (value >= 0x80U) { + *data++ = char((value & 0x7fU) | 0x80U); + value >>= 7U; + ++n; + } + *data = char(value); + + return n; +} + +/** + * Get the length of the varint the specified value would produce. + * + * @param value The integer to be encoded. + * @returns the number of bytes the varint would have if we created it. + */ +inline int length_of_varint(uint64_t value) noexcept { + int n = 1; + + while (value >= 0x80U) { + value >>= 7U; + ++n; + } + + return n; +} + +/** + * ZigZag encodes a 32 bit integer. + */ +inline constexpr uint32_t encode_zigzag32(int32_t value) noexcept { + return (static_cast(value) << 1U) ^ static_cast(-static_cast(static_cast(value) >> 31U)); +} + +/** + * ZigZag encodes a 64 bit integer. 
+ */ +inline constexpr uint64_t encode_zigzag64(int64_t value) noexcept { + return (static_cast(value) << 1U) ^ static_cast(-static_cast(static_cast(value) >> 63U)); +} + +/** + * Decodes a 32 bit ZigZag-encoded integer. + */ +inline constexpr int32_t decode_zigzag32(uint32_t value) noexcept { + return static_cast((value >> 1U) ^ static_cast(-static_cast(value & 1U))); +} + +/** + * Decodes a 64 bit ZigZag-encoded integer. + */ +inline constexpr int64_t decode_zigzag64(uint64_t value) noexcept { + return static_cast((value >> 1U) ^ static_cast(-static_cast(value & 1U))); +} + +} // end namespace protozero + +#endif // PROTOZERO_VARINT_HPP diff --git a/include/protozero/version.hpp b/include/protozero/version.hpp new file mode 100644 index 00000000..fc9b9287 --- /dev/null +++ b/include/protozero/version.hpp @@ -0,0 +1,34 @@ +#ifndef PROTOZERO_VERSION_HPP +#define PROTOZERO_VERSION_HPP + +/***************************************************************************** + +protozero - Minimalistic protocol buffer decoder and encoder in C++. + +This file is from https://github.com/mapbox/protozero where you can find more +documentation. + +*****************************************************************************/ + +/** + * @file version.hpp + * + * @brief Contains macros defining the protozero version. 
+ */ + +/// The major version number +#define PROTOZERO_VERSION_MAJOR 1 + +/// The minor version number +#define PROTOZERO_VERSION_MINOR 7 + +/// The patch number +#define PROTOZERO_VERSION_PATCH 1 + +/// The complete version number +#define PROTOZERO_VERSION_CODE (PROTOZERO_VERSION_MAJOR * 10000 + PROTOZERO_VERSION_MINOR * 100 + PROTOZERO_VERSION_PATCH) + +/// Version number as string +#define PROTOZERO_VERSION_STRING "1.7.1" + +#endif // PROTOZERO_VERSION_HPP diff --git a/include/sharded_node_store.h b/include/sharded_node_store.h new file mode 100644 index 00000000..836c34ef --- /dev/null +++ b/include/sharded_node_store.h @@ -0,0 +1,32 @@ +#ifndef _SHARDED_NODE_STORE +#define _SHARDED_NODE_STORE + +#include +#include +#include "node_store.h" + +class ShardedNodeStore : public NodeStore { +public: + ShardedNodeStore(std::function()> createNodeStore); + ~ShardedNodeStore(); + void reopen() override; + void finalize(size_t threadNum) override; + LatpLon at(NodeID i) const override; + size_t size() const override; + void batchStart() override; + void insert(const std::vector& elements) override; + void clear() override { + reopen(); + } + + bool contains(size_t shard, NodeID id) const override; + NodeStore& shard(size_t shard) override { return *stores[shard]; } + const NodeStore& shard(size_t shard) const override { return *stores[shard]; } + size_t shards() const override; + +private: + std::function()> createNodeStore; + std::vector> stores; +}; + +#endif diff --git a/include/sharded_way_store.h b/include/sharded_way_store.h new file mode 100644 index 00000000..40a3d331 --- /dev/null +++ b/include/sharded_way_store.h @@ -0,0 +1,35 @@ +#ifndef _SHARDED_WAY_STORE +#define _SHARDED_WAY_STORE + +#include +#include +#include "way_store.h" + +class NodeStore; + +class ShardedWayStore : public WayStore { +public: + ShardedWayStore(std::function()> createWayStore, const NodeStore& nodeStore); + ~ShardedWayStore(); + void reopen() override; + void batchStart() override; 
+ std::vector at(WayID wayid) const override; + bool requiresNodes() const override; + void insertLatpLons(std::vector &newWays) override; + void insertNodes(const std::vector>>& newWays) override; + void clear() override; + std::size_t size() const override; + void finalize(unsigned int threadNum) override; + + bool contains(size_t shard, WayID id) const override; + WayStore& shard(size_t shard) override; + const WayStore& shard(size_t shard) const override; + size_t shards() const override; + +private: + std::function()> createWayStore; + const NodeStore& nodeStore; + std::vector> stores; +}; + +#endif diff --git a/include/shared_data.h b/include/shared_data.h index 23ba9a06..45c6e34b 100644 --- a/include/shared_data.h +++ b/include/shared_data.h @@ -7,6 +7,7 @@ #include "rapidjson/document.h" +#include "options_parser.h" #include "osm_store.h" #include "output_object.h" #include "mbtiles.h" @@ -61,10 +62,6 @@ class LayerDefinition { std::string serialiseToJSON() const; }; -const int OUTPUT_FILE = 0; -const int OUTPUT_MBTILES = 1; -const int OUTPUT_PMTILES = 2; - ///\brief Config read from JSON to control behavior of program class Config { @@ -91,7 +88,7 @@ class SharedData { public: const class LayerDefinition &layers; - int outputMode; + OptionsParser::OutputMode outputMode; bool mergeSqlite; MBTiles mbtiles; PMTiles pmtiles; diff --git a/include/shp_mem_tiles.h b/include/shp_mem_tiles.h index 267a0090..508921ff 100644 --- a/include/shp_mem_tiles.h +++ b/include/shp_mem_tiles.h @@ -11,6 +11,8 @@ class ShpMemTiles : public TileDataSource public: ShpMemTiles(size_t threadNum, uint baseZoom); + std::string name() const override { return "shp"; } + void CreateNamedLayerIndex(const std::string& layerName); // Used in shape file loading diff --git a/include/sorted_node_store.h b/include/sorted_node_store.h index 5c156ad3..61fdfad3 100644 --- a/include/sorted_node_store.h +++ b/include/sorted_node_store.h @@ -3,6 +3,7 @@ #include "node_store.h" #include 
"mmap_allocator.h" +#include #include #include #include @@ -65,10 +66,15 @@ class SortedNodeStore : public NodeStore size_t size() const override; void batchStart() override; void insert(const std::vector& elements) override; - void clear() { + void clear() override { reopen(); } + bool contains(size_t shard, NodeID id) const override; + NodeStore& shard(size_t shard) override { return *this; } + const NodeStore& shard(size_t shard) const override { return *this; } + size_t shards() const override { return 1; } + private: // When true, store chunks compressed. Only store compressed if the // chunk is sufficiently large. @@ -82,6 +88,15 @@ class SortedNodeStore : public NodeStore // multiple threads. They'll get folded into the index during finalize() std::map> orphanage; std::vector> workerBuffers; + + std::atomic totalGroups; + std::atomic totalNodes; + std::atomic totalGroupSpace; + std::atomic totalAllocatedSpace; + std::atomic totalChunks; + std::atomic chunkSizeFreqs[257]; + std::atomic groupSizeFreqs[257]; + void collectOrphans(const std::vector& orphans); void publishGroup(const std::vector& nodes); }; diff --git a/include/sorted_way_store.h b/include/sorted_way_store.h index 145e467b..b99ba7de 100644 --- a/include/sorted_way_store.h +++ b/include/sorted_way_store.h @@ -1,6 +1,7 @@ #ifndef _SORTED_WAY_STORE_H #define _SORTED_WAY_STORE_H +#include #include #include #include @@ -89,10 +90,15 @@ class SortedWayStore: public WayStore { std::vector at(WayID wayid) const override; bool requiresNodes() const override { return true; } void insertLatpLons(std::vector &newWays) override; - const void insertNodes(const std::vector>>& newWays) override; + void insertNodes(const std::vector>>& newWays) override; void clear() override; std::size_t size() const override; void finalize(unsigned int threadNum) override; + + bool contains(size_t shard, WayID id) const override; + WayStore& shard(size_t shard) override { return *this; } + const WayStore& shard(size_t shard) 
const override { return *this; } + size_t shards() const override { return 1; } static uint16_t encodeWay( const std::vector& way, @@ -113,6 +119,13 @@ class SortedWayStore: public WayStore { // multiple threads. They'll get folded into the index during finalize() std::map>>> orphanage; std::vector>>> workerBuffers; + + std::atomic totalWays; + std::atomic totalNodes; + std::atomic totalGroups; + std::atomic totalGroupSpace; + std::atomic totalChunks; + void collectOrphans(const std::vector>>& orphans); void publishGroup(const std::vector>>& ways); }; diff --git a/include/tag_map.h b/include/tag_map.h new file mode 100644 index 00000000..f951d9e9 --- /dev/null +++ b/include/tag_map.h @@ -0,0 +1,56 @@ +#ifndef _TAG_MAP_H +#define _TAG_MAP_H + +#include +#include +#include +#include + +// We track tags in a special structure, which enables some tricks when +// doing Lua interop. +// +// The alternative is a std::map - but often, our map is quite small. +// It's preferable to have a small set of vectors and do linear search. +// +// Further, we can avoid passing std::string from Lua -> C++ in some cases +// by first checking to see if the string we would have passed is already +// stored in our tag map, and passing a reference to its location. + +// Assumptions: +// 1. Not thread-safe +// This is OK because we have 1 instance of OsmLuaProcessing per thread. +// 2. Lifetime of map is less than lifetime of keys/values that are passed +// This is true since the strings are owned by the protobuf block reader +// 3. Max number of tag values will fit in a short +// OSM limit is 5,000 tags per object +class TagMap { +public: + TagMap(); + void reset(); + + void addTag(const protozero::data_view& key, const protozero::data_view& value); + + // Return -1 if key not found, else return its keyLoc. + int64_t getKey(const char* key, size_t size) const; + + // Return -1 if value not found, else return its keyLoc. 
+ int64_t getValue(const char* key, size_t size) const; + + const protozero::data_view* getValueFromKey(uint32_t keyLoc) const; + const protozero::data_view* getValue(uint32_t valueLoc) const; + + boost::container::flat_map exportToBoostMap() const; + +private: + uint32_t ensureString( + std::vector>& vector, + const protozero::data_view& value + ); + + + std::vector> keys; + std::vector> key2value; + std::vector> values; +}; + +#endif _TAG_MAP_H diff --git a/include/tile_data.h b/include/tile_data.h index 814b53ce..b78463e2 100644 --- a/include/tile_data.h +++ b/include/tile_data.h @@ -8,7 +8,11 @@ #include #include #include "output_object.h" +#include "append_vector.h" #include "clip_cache.h" +#include "mmap_allocator.h" + +#define TILE_DATA_ID_SIZE 34 typedef std::vector SourceList; @@ -45,16 +49,40 @@ struct OutputObjectXYID { }; template void finalizeObjects( + const std::string& name, const size_t& threadNum, const unsigned int& baseZoom, - typename std::vector>::iterator begin, - typename std::vector>::iterator end + typename std::vector>::iterator begin, + typename std::vector>::iterator end, + typename std::vector>& lowZoom ) { - for (typename std::vector>::iterator it = begin; it != end; it++) { + size_t z6OffsetDivisor = baseZoom >= CLUSTER_ZOOM ? 
(1 << (baseZoom - CLUSTER_ZOOM)) : 1; +#ifdef CLOCK_MONOTONIC + timespec startTs, endTs; + clock_gettime(CLOCK_MONOTONIC, &startTs); +#endif + + int i = -1; + for (auto it = begin; it != end; it++) { + i++; + if (it->size() > 0 || i % 10 == 0 || i == 4095) { + std::cout << "\r" << name << ": finalizing z6 tile " << (i + 1) << "/" << CLUSTER_ZOOM_AREA; + +#ifdef CLOCK_MONOTONIC + clock_gettime(CLOCK_MONOTONIC, &endTs); + uint64_t elapsedNs = 1e9 * (endTs.tv_sec - startTs.tv_sec) + endTs.tv_nsec - startTs.tv_nsec; + std::cout << " (" << std::to_string((uint32_t)(elapsedNs / 1e6)) << " ms)"; +#endif + std::cout << std::flush; + } if (it->size() == 0) continue; - it->shrink_to_fit(); + // We track a separate copy of low zoom objects to avoid scanning large + // lists of objects that may be on slow disk storage. + for (auto objectIt = it->begin(); objectIt != it->end(); objectIt++) + if (objectIt->oo.minZoom < CLUSTER_ZOOM) + lowZoom[i].push_back(*objectIt); // If the user is doing a a small extract, there are few populated // entries in `object`. @@ -102,17 +130,18 @@ template void finalizeObjects( }, threadNum ); - } + + std::cout << std::endl; } template void collectTilesWithObjectsAtZoomTemplate( const unsigned int& baseZoom, - const typename std::vector>::iterator objects, + const typename std::vector>::iterator objects, const size_t size, - const unsigned int zoom, - TileCoordinatesSet& output + std::vector& zooms ) { + size_t maxZoom = zooms.size() - 1; uint16_t z6OffsetDivisor = baseZoom >= CLUSTER_ZOOM ? 
(1 << (baseZoom - CLUSTER_ZOOM)) : 1; int64_t lastX = -1; int64_t lastY = -1; @@ -126,13 +155,18 @@ template void collectTilesWithObjectsAtZoomTemplate( TileCoordinate baseY = z6y * z6OffsetDivisor + objects[i][j].y; // Translate the x, y at the requested zoom level - TileCoordinate x = baseX / (1 << (baseZoom - zoom)); - TileCoordinate y = baseY / (1 << (baseZoom - zoom)); + TileCoordinate x = baseX / (1 << (baseZoom - maxZoom)); + TileCoordinate y = baseY / (1 << (baseZoom - maxZoom)); if (lastX != x || lastY != y) { - output.set(x, y); lastX = x; lastY = y; + + for (int zoom = maxZoom; zoom >= 0; zoom--) { + zooms[zoom].set(x, y); + x /= 2; + y /= 2; + } } } } @@ -148,107 +182,124 @@ inline OutputObjectID outputObjectWithId(const OutputObjectXYI return OutputObjectID({ input.oo, input.id }); } +template void collectLowZoomObjectsForTile( + const unsigned int& baseZoom, + typename std::vector> objects, + unsigned int zoom, + const TileCoordinates& dstIndex, + std::vector& output +) { + if (zoom >= CLUSTER_ZOOM) + throw std::runtime_error("collectLowZoomObjectsForTile should not be called for high zooms"); + + uint16_t z6OffsetDivisor = baseZoom >= CLUSTER_ZOOM ? 
(1 << (baseZoom - CLUSTER_ZOOM)) : 1; + + for (size_t i = 0; i < objects.size(); i++) { + const size_t z6x = i / CLUSTER_ZOOM_WIDTH; + const size_t z6y = i % CLUSTER_ZOOM_WIDTH; + + for (size_t j = 0; j < objects[i].size(); j++) { + // Compute the x, y at the base zoom level + TileCoordinate baseX = z6x * z6OffsetDivisor + objects[i][j].x; + TileCoordinate baseY = z6y * z6OffsetDivisor + objects[i][j].y; + + // Translate the x, y at the requested zoom level + TileCoordinate x = baseX / (1 << (baseZoom - zoom)); + TileCoordinate y = baseY / (1 << (baseZoom - zoom)); + + if (dstIndex.x == x && dstIndex.y == y) { + if (objects[i][j].oo.minZoom <= zoom) { + output.push_back(outputObjectWithId(objects[i][j])); + } + } + } + } +} + template void collectObjectsForTileTemplate( const unsigned int& baseZoom, - typename std::vector>::iterator objects, + typename std::vector>::iterator objects, size_t iStart, size_t iEnd, unsigned int zoom, const TileCoordinates& dstIndex, std::vector& output ) { + if (zoom < CLUSTER_ZOOM) + throw std::runtime_error("collectObjectsForTileTemplate should not be called for low zooms"); + uint16_t z6OffsetDivisor = baseZoom >= CLUSTER_ZOOM ? (1 << (baseZoom - CLUSTER_ZOOM)) : 1; for (size_t i = iStart; i < iEnd; i++) { - const size_t z6x = i / CLUSTER_ZOOM_WIDTH; - const size_t z6y = i % CLUSTER_ZOOM_WIDTH; + // If z >= 6, we can compute the exact bounds within the objects array. + // Translate to the base zoom, then do a binary search to find + // the starting point. + TileCoordinate z6x = dstIndex.x / (1 << (zoom - CLUSTER_ZOOM)); + TileCoordinate z6y = dstIndex.y / (1 << (zoom - CLUSTER_ZOOM)); + + TileCoordinate baseX = dstIndex.x * (1 << (baseZoom - zoom)); + TileCoordinate baseY = dstIndex.y * (1 << (baseZoom - zoom)); + + Z6Offset needleX = baseX - z6x * z6OffsetDivisor; + Z6Offset needleY = baseY - z6y * z6OffsetDivisor; + + // Kind of gross that we have to do this. 
Might be better if we split + // into two arrays, one of x/y and one of OOs. Would have better locality for + // searching, too. + OutputObject dummyOo(POINT_, 0, 0, 0, 0); + const size_t bz = baseZoom; + + const OO targetXY = {dummyOo, needleX, needleY }; + auto iter = std::lower_bound( + objects[i].begin(), + objects[i].end(), + targetXY, + [bz](const OO& a, const OO& b) { + // Cluster by parent zoom, so that a subsequent search + // can find a contiguous range of entries for any tile + // at zoom 6 or higher. + const size_t aX = a.x; + const size_t aY = a.y; + const size_t bX = b.x; + const size_t bY = b.y; + for (size_t z = CLUSTER_ZOOM; z <= bz; z++) { + const auto aXz = aX / (1 << (bz - z)); + const auto aYz = aY / (1 << (bz - z)); + const auto bXz = bX / (1 << (bz - z)); + const auto bYz = bY / (1 << (bz - z)); - if (zoom >= CLUSTER_ZOOM) { - // If z >= 6, we can compute the exact bounds within the objects array. - // Translate to the base zoom, then do a binary search to find - // the starting point. - TileCoordinate z6x = dstIndex.x / (1 << (zoom - CLUSTER_ZOOM)); - TileCoordinate z6y = dstIndex.y / (1 << (zoom - CLUSTER_ZOOM)); - - TileCoordinate baseX = dstIndex.x * (1 << (baseZoom - zoom)); - TileCoordinate baseY = dstIndex.y * (1 << (baseZoom - zoom)); - - Z6Offset needleX = baseX - z6x * z6OffsetDivisor; - Z6Offset needleY = baseY - z6y * z6OffsetDivisor; - - // Kind of gross that we have to do this. Might be better if we split - // into two arrays, one of x/y and one of OOs. Would have better locality for - // searching, too. - OutputObject dummyOo(POINT_, 0, 0, 0, 0); - const size_t bz = baseZoom; - - const OO targetXY = {dummyOo, needleX, needleY }; - auto iter = std::lower_bound( - objects[i].begin(), - objects[i].end(), - targetXY, - [bz](const OO& a, const OO& b) { - // Cluster by parent zoom, so that a subsequent search - // can find a contiguous range of entries for any tile - // at zoom 6 or higher. 
- const size_t aX = a.x; - const size_t aY = a.y; - const size_t bX = b.x; - const size_t bY = b.y; - for (size_t z = CLUSTER_ZOOM; z <= bz; z++) { - const auto aXz = aX / (1 << (bz - z)); - const auto aYz = aY / (1 << (bz - z)); - const auto bXz = bX / (1 << (bz - z)); - const auto bYz = bY / (1 << (bz - z)); - - if (aXz != bXz) - return aXz < bXz; - - if (aYz != bYz) - return aYz < bYz; - } - return false; - } - ); - for (; iter != objects[i].end(); iter++) { - // Compute the x, y at the base zoom level - TileCoordinate baseX = z6x * z6OffsetDivisor + iter->x; - TileCoordinate baseY = z6y * z6OffsetDivisor + iter->y; - - // Translate the x, y at the requested zoom level - TileCoordinate x = baseX / (1 << (baseZoom - zoom)); - TileCoordinate y = baseY / (1 << (baseZoom - zoom)); - - if (dstIndex.x == x && dstIndex.y == y) { - if (iter->oo.minZoom <= zoom) { - output.push_back(outputObjectWithId(*iter)); - } - } else { - // Short-circuit when we're confident we'd no longer see relevant matches. - // We've ordered the entries in `objects` such that all objects that - // share the same tile at any zoom are in contiguous runs. - // - // Thus, as soon as we fail to find a match, we can stop looking. 
- break; - } + if (aXz != bXz) + return aXz < bXz; + if (aYz != bYz) + return aYz < bYz; + } + return false; } - } else { - for (size_t j = 0; j < objects[i].size(); j++) { - // Compute the x, y at the base zoom level - TileCoordinate baseX = z6x * z6OffsetDivisor + objects[i][j].x; - TileCoordinate baseY = z6y * z6OffsetDivisor + objects[i][j].y; - - // Translate the x, y at the requested zoom level - TileCoordinate x = baseX / (1 << (baseZoom - zoom)); - TileCoordinate y = baseY / (1 << (baseZoom - zoom)); - - if (dstIndex.x == x && dstIndex.y == y) { - if (objects[i][j].oo.minZoom <= zoom) { - output.push_back(outputObjectWithId(objects[i][j])); - } + ); + + for (; iter != objects[i].end(); iter++) { + // Compute the x, y at the base zoom level + TileCoordinate baseX = z6x * z6OffsetDivisor + iter->x; + TileCoordinate baseY = z6y * z6OffsetDivisor + iter->y; + + // Translate the x, y at the requested zoom level + TileCoordinate x = baseX / (1 << (baseZoom - zoom)); + TileCoordinate y = baseY / (1 << (baseZoom - zoom)); + + if (dstIndex.x == x && dstIndex.y == y) { + if (iter->oo.minZoom <= zoom) { + output.push_back(outputObjectWithId(*iter)); } + } else { + // Short-circuit when we're confident we'd no longer see relevant matches. + // We've ordered the entries in `objects` such that all objects that + // share the same tile at any zoom are in contiguous runs. + // + // Thus, as soon as we fail to find a match, we can stop looking. + break; } + } } } @@ -275,6 +326,7 @@ class TileDataSource { std::vector> availableMultiLinestringStoreLeases; std::vector> availableMultiPolygonStoreLeases; + virtual std::string name() const = 0; protected: size_t numShards; @@ -292,8 +344,10 @@ class TileDataSource { // // If config.include_ids is true, objectsWithIds will be populated. // Otherwise, objects. 
- std::vector> objects; - std::vector> objectsWithIds; + std::vector> objects; + std::vector> lowZoomObjects; + std::vector> objectsWithIds; + std::vector> lowZoomObjectsWithIds; // rtree index of large objects using oo_rtree_param_type = boost::geometry::index::quadratic<128>; @@ -310,12 +364,14 @@ class TileDataSource { ClipCache multiPolygonClipCache; ClipCache multiLinestringClipCache; + std::deque>> pendingSmallIndexObjects; + public: TileDataSource(size_t threadNum, unsigned int baseZoom, bool includeID); - void collectTilesWithObjectsAtZoom(uint zoom, TileCoordinatesSet& output); + void collectTilesWithObjectsAtZoom(std::vector& zooms); - void collectTilesWithLargeObjectsAtZoom(uint zoom, TileCoordinatesSet& output); + void collectTilesWithLargeObjectsAtZoom(std::vector& zooms); void collectObjectsForTile(uint zoom, TileCoordinates dstIndex, std::vector& output); void finalize(size_t threadNum); @@ -337,6 +393,8 @@ class TileDataSource { ); void addObjectToSmallIndex(const TileCoordinates& index, const OutputObject& oo, uint64_t id); + void addObjectToSmallIndex(const TileCoordinates& index, const OutputObject& oo, uint64_t id, bool needsLock); + void addObjectToSmallIndexUnsafe(const TileCoordinates& index, const OutputObject& oo, uint64_t id); void addObjectToLargeIndex(const Box& envelope, const OutputObject& oo, uint64_t id) { std::lock_guard lock(mutex); @@ -355,7 +413,7 @@ class TileDataSource { ); virtual Geometry buildWayGeometry(OutputGeometryType const geomType, NodeID const objectID, const TileBbox &bbox); - LatpLon buildNodeGeometry(OutputGeometryType const geomType, NodeID const objectID, const TileBbox &bbox) const; + virtual LatpLon buildNodeGeometry(NodeID const objectID, const TileBbox &bbox) const; void open() { // Put something at index 0 of all stores so that 0 can be used @@ -373,18 +431,18 @@ class TileDataSource { NodeID storePoint(Point const &input); inline size_t getShard(NodeID id) const { - // Note: we only allocate 35 bits for 
the IDs. This allows us to - // use bit 36 for TileDataSource-specific handling (e.g., + // Note: we only allocate 34 bits for the IDs. This allows us to + // use bits 35 and 36 for TileDataSource-specific handling (e.g., // OsmMemTiles may want to generate points/ways on the fly by // referring to the WayStore). - return id >> (35 - shardBits); + return id >> (TILE_DATA_ID_SIZE - shardBits); } virtual void populateMultiPolygon(MultiPolygon& dst, NodeID objectID); inline size_t getId(NodeID id) const { - return id & (~(~0ull << (35 - shardBits))); + return id & (~(~0ull << (TILE_DATA_ID_SIZE - shardBits))); } const Point& retrievePoint(NodeID id) const { @@ -426,9 +484,9 @@ class TileDataSource { } }; -TileCoordinatesSet getTilesAtZoom( +void populateTilesAtZoom( const std::vector& sources, - unsigned int zoom + std::vector& zooms ); #endif //_TILE_DATA_H diff --git a/include/way_store.h b/include/way_store.h index 8650cbea..36862344 100644 --- a/include/way_store.h +++ b/include/way_store.h @@ -17,10 +17,15 @@ class WayStore { virtual std::vector at(WayID wayid) const = 0; virtual bool requiresNodes() const = 0; virtual void insertLatpLons(std::vector& newWays) = 0; - virtual const void insertNodes(const std::vector>>& newWays) = 0; + virtual void insertNodes(const std::vector>>& newWays) = 0; virtual void clear() = 0; virtual std::size_t size() const = 0; virtual void finalize(unsigned int threadNum) = 0; + + virtual bool contains(size_t shard, WayID id) const = 0; + virtual WayStore& shard(size_t shard) = 0; + virtual const WayStore& shard(size_t shard) const = 0; + virtual size_t shards() const = 0; }; #endif diff --git a/include/way_stores.h b/include/way_stores.h index dfb5f74c..0f94e845 100644 --- a/include/way_stores.h +++ b/include/way_stores.h @@ -5,6 +5,7 @@ #include #include "way_store.h" #include "sorted_way_store.h" +#include "sharded_way_store.h" class BinarySearchWayStore: public WayStore { @@ -16,11 +17,16 @@ class BinarySearchWayStore: public 
WayStore { std::vector at(WayID wayid) const override; bool requiresNodes() const override { return false; } void insertLatpLons(std::vector &newWays) override; - const void insertNodes(const std::vector>>& newWays) override; + void insertNodes(const std::vector>>& newWays) override; void clear() override; std::size_t size() const override; void finalize(unsigned int threadNum) override; + bool contains(size_t shard, WayID id) const override; + WayStore& shard(size_t shard) override { return *this; } + const WayStore& shard(size_t shard) const override { return *this; } + size_t shards() const override { return 1; } + private: mutable std::mutex mutex; std::unique_ptr mLatpLonLists; diff --git a/include/write_geometry.h b/include/write_geometry.h index 8d1d014b..985b7b66 100644 --- a/include/write_geometry.h +++ b/include/write_geometry.h @@ -9,7 +9,6 @@ #include "coordinates_geom.h" // Protobuf -#include "osmformat.pb.h" #include "vector_tile.pb.h" typedef std::vector > XYString; diff --git a/resources/process-coastline.lua b/resources/process-coastline.lua index 5e2aca8e..b49eeee5 100644 --- a/resources/process-coastline.lua +++ b/resources/process-coastline.lua @@ -10,10 +10,10 @@ function exit_function() end node_keys = {} -function node_function(node) +function node_function() end -function way_function(way) +function way_function() end -- Remap coastlines diff --git a/resources/process-debug.lua b/resources/process-debug.lua index ea594c19..e1c8e62f 100644 --- a/resources/process-debug.lua +++ b/resources/process-debug.lua @@ -45,36 +45,36 @@ aerodromeValues = Set { "international", "public", "regional", "military", "priv -- Process node tags node_keys = { "amenity", "shop", "sport", "tourism", "place", "office", "natural", "addr:housenumber", "aeroway" } -function node_function(node) +function node_function() -- Write 'aerodrome_label' - local aeroway = node:Find("aeroway") + local aeroway = Find("aeroway") if aeroway == "aerodrome" then - 
node:Layer("aerodrome_label", false) - SetNameAttributes(node) - node:Attribute("iata", node:Find("iata")) - SetEleAttributes(node) - node:Attribute("icao", node:Find("icao")) + Layer("aerodrome_label", false) + SetNameAttributes() + Attribute("iata", Find("iata")) + SetEleAttributes() + Attribute("icao", Find("icao")) - local aerodrome_value = node:Find("aerodrome") + local aerodrome_value = Find("aerodrome") local class if aerodromeValues[aerodrome_value] then class = aerodrome_value else class = "other" end - node:Attribute("class", class) + Attribute("class", class) end -- Write 'housenumber' - local housenumber = node:Find("addr:housenumber") + local housenumber = Find("addr:housenumber") if housenumber~="" then - node:Layer("housenumber", false) - node:Attribute("housenumber", housenumber) + Layer("housenumber", false) + Attribute("housenumber", housenumber) end -- Write 'place' -- note that OpenMapTiles has a rank for countries (1-3), states (1-6) and cities (1-10+); -- we could potentially approximate it for cities based on the population tag - local place = node:Find("place") + local place = Find("place") if place ~= "" then local rank = nil local mz = 13 - local pop = tonumber(node:Find("population")) or 0 + local pop = tonumber(Find("population")) or 0 if place == "continent" then mz=2 elseif place == "country" then mz=3; rank=1 @@ -90,31 +90,31 @@ function node_function(node) elseif place == "locality" then mz=13 end - node:Layer("place", false) - node:Attribute("class", place) - node:MinZoom(mz) - if rank then node:AttributeNumeric("rank", rank) end - SetNameAttributes(node) + Layer("place", false) + Attribute("class", place) + MinZoom(mz) + if rank then AttributeNumeric("rank", rank) end + SetNameAttributes() return end -- Write 'poi' - local rank, class, subclass = GetPOIRank(node) + local rank, class, subclass = GetPOIRank() if rank then WritePOI(node,class,subclass,rank) end -- Write 'mountain_peak' and 'water_name' - local natural = 
node:Find("natural") + local natural = Find("natural") if natural == "peak" or natural == "volcano" then - node:Layer("mountain_peak", false) - SetEleAttributes(node) - node:AttributeNumeric("rank", 1) - node:Attribute("class", natural) - SetNameAttributes(node) + Layer("mountain_peak", false) + SetEleAttributes() + AttributeNumeric("rank", 1) + Attribute("class", natural) + SetNameAttributes() return end if natural == "bay" then - node:Layer("water_name", false) - SetNameAttributes(node) + Layer("water_name", false) + SetNameAttributes() return end end @@ -196,33 +196,33 @@ waterClasses = Set { "river", "riverbank", "stream", "canal", "drain", "ditch waterwayClasses = Set { "stream", "river", "canal", "drain", "ditch" } -function way_function(way) - local highway = way:Find("highway") - local waterway = way:Find("waterway") - local water = way:Find("water") - local building = way:Find("building") - local natural = way:Find("natural") - local historic = way:Find("historic") - local landuse = way:Find("landuse") - local leisure = way:Find("leisure") - local amenity = way:Find("amenity") - local aeroway = way:Find("aeroway") - local railway = way:Find("railway") - local sport = way:Find("sport") - local shop = way:Find("shop") - local tourism = way:Find("tourism") - local man_made = way:Find("man_made") - local isClosed = way:IsClosed() - local housenumber = way:Find("addr:housenumber") +function way_function() + local highway = Find("highway") + local waterway = Find("waterway") + local water = Find("water") + local building = Find("building") + local natural = Find("natural") + local historic = Find("historic") + local landuse = Find("landuse") + local leisure = Find("leisure") + local amenity = Find("amenity") + local aeroway = Find("aeroway") + local railway = Find("railway") + local sport = Find("sport") + local shop = Find("shop") + local tourism = Find("tourism") + local man_made = Find("man_made") + local isClosed = IsClosed() + local housenumber = 
Find("addr:housenumber") local write_name = false - local construction = way:Find("construction") + local construction = Find("construction") -- Miscellaneous preprocessing - if way:Find("disused") == "yes" then return end + if Find("disused") == "yes" then return end if highway == "proposed" then return end if aerowayBuildings[aeroway] then building="yes"; aeroway="" end if landuse == "field" then landuse = "farmland" end - if landuse == "meadow" and way:Find("meadow")=="agricultural" then landuse="farmland" end + if landuse == "meadow" and Find("meadow")=="agricultural" then landuse="farmland" end -- Roads ('transportation' and 'transportation_name', plus 'transportation_name_detail') if highway~="" then @@ -235,33 +235,33 @@ function way_function(way) if trackValues[highway] then h = "track"; layer="transportation_detail" end if pathValues[highway] then h = "path" ; layer="transportation_detail" end if h=="service" then layer="transportation_detail" end - way:Layer(layer, false) - way:Attribute("class", h) - SetBrunnelAttributes(way) + Layer(layer, false) + Attribute("class", h) + SetBrunnelAttributes() -- Construction if highway == "construction" then if constructionValues[construction] then - way:Attribute("class", construction .. "_construction") + Attribute("class", construction .. 
"_construction") else - way:Attribute("class", "minor_construction") + Attribute("class", "minor_construction") end end -- Service - local service = way:Find("service") - if highway == "service" and service ~="" then way:Attribute("service", service) end + local service = Find("service") + if highway == "service" and service ~="" then Attribute("service", service) end -- Links (ramp) if linkValues[highway] then splitHighway = split(highway, "_") highway = splitHighway[1] - way:AttributeNumeric("ramp",1) + AttributeNumeric("ramp",1) end - local oneway = way:Find("oneway") + local oneway = Find("oneway") if oneway == "yes" or oneway == "1" then - way:AttributeNumeric("oneway",1) + AttributeNumeric("oneway",1) end if oneway == "-1" then -- **** TODO @@ -269,115 +269,115 @@ function way_function(way) -- Write names if layer == "motorway" or layer == "trunk" then - way:Layer("transportation_name", false) + Layer("transportation_name", false) elseif h == "minor" or h == "track" or h == "path" or h == "service" then - way:Layer("transportation_name_detail", false) + Layer("transportation_name_detail", false) else - way:Layer("transportation_name_mid", false) + Layer("transportation_name_mid", false) end - SetNameAttributes(way) - way:Attribute("class",h) - way:Attribute("network","road") -- **** needs fixing - if h~=highway then way:Attribute("subclass",highway) end - local ref = way:Find("ref") + SetNameAttributes() + Attribute("class",h) + Attribute("network","road") -- **** needs fixing + if h~=highway then Attribute("subclass",highway) end + local ref = Find("ref") if ref~="" then - way:Attribute("ref",ref) - way:AttributeNumeric("ref_length",ref:len()) + Attribute("ref",ref) + AttributeNumeric("ref_length",ref:len()) end end -- Railways ('transportation' and 'transportation_name', plus 'transportation_name_detail') if railway~="" then - way:Layer("transportation", false) - way:Attribute("class", railway) + Layer("transportation", false) + Attribute("class", railway) 
- way:Layer("transportation_name", false) - SetNameAttributes(way) - way:MinZoom(14) - way:Attribute("class", "rail") + Layer("transportation_name", false) + SetNameAttributes() + MinZoom(14) + Attribute("class", "rail") end -- 'Aeroway' if aeroway~="" then - way:Layer("aeroway", isClosed) - way:Attribute("class",aeroway) - way:Attribute("ref",way:Find("ref")) + Layer("aeroway", isClosed) + Attribute("class",aeroway) + Attribute("ref",Find("ref")) write_name = true end -- 'aerodrome_label' if aeroway=="aerodrome" then - way:LayerAsCentroid("aerodrome_label") - SetNameAttributes(way) - way:Attribute("iata", way:Find("iata")) - SetEleAttributes(way) - way:Attribute("icao", way:Find("icao")) + LayerAsCentroid("aerodrome_label") + SetNameAttributes() + Attribute("iata", Find("iata")) + SetEleAttributes() + Attribute("icao", Find("icao")) - local aerodrome = way:Find(aeroway) + local aerodrome = Find(aeroway) local class if aerodromeValues[aerodrome] then class = aerodrome else class = "other" end - way:Attribute("class", class) + Attribute("class", class) end -- Set 'waterway' and associated if waterwayClasses[waterway] and not isClosed then - if waterway == "river" and way:Holds("name") then - way:Layer("waterway", false) + if waterway == "river" and Holds("name") then + Layer("waterway", false) else - way:Layer("waterway_detail", false) + Layer("waterway_detail", false) end - if way:Find("intermittent")=="yes" then way:AttributeNumeric("intermittent", 1) else way:AttributeNumeric("intermittent", 0) end - way:Attribute("class", waterway) - SetNameAttributes(way) - SetBrunnelAttributes(way) - elseif waterway == "boatyard" then way:Layer("landuse", isClosed); way:Attribute("class", "industrial") - elseif waterway == "dam" then way:Layer("building",isClosed) - elseif waterway == "fuel" then way:Layer("landuse", isClosed); way:Attribute("class", "industrial") + if Find("intermittent")=="yes" then AttributeNumeric("intermittent", 1) else AttributeNumeric("intermittent", 0) 
end + Attribute("class", waterway) + SetNameAttributes() + SetBrunnelAttributes() + elseif waterway == "boatyard" then Layer("landuse", isClosed); Attribute("class", "industrial") + elseif waterway == "dam" then Layer("building",isClosed) + elseif waterway == "fuel" then Layer("landuse", isClosed); Attribute("class", "industrial") end -- Set names on rivers if waterwayClasses[waterway] and not isClosed then - if waterway == "river" and way:Holds("name") then - way:Layer("water_name", false) + if waterway == "river" and Holds("name") then + Layer("water_name", false) else - way:Layer("water_name_detail", false) - way:MinZoom(14) + Layer("water_name_detail", false) + MinZoom(14) end - way:Attribute("class", waterway) - SetNameAttributes(way) + Attribute("class", waterway) + SetNameAttributes() end -- Set 'building' and associated if building~="" then - way:Layer("building", true) - SetMinZoomByArea(way) + Layer("building", true) + SetMinZoomByArea() end -- Set 'housenumber' if housenumber~="" then - way:LayerAsCentroid("housenumber", false) - way:Attribute("housenumber", housenumber) + LayerAsCentroid("housenumber", false) + Attribute("housenumber", housenumber) end -- Set 'water' if natural=="water" or natural=="bay" or leisure=="swimming_pool" or landuse=="reservoir" or landuse=="basin" or waterClasses[waterway] then - if way:Find("covered")=="yes" or not isClosed then return end + if Find("covered")=="yes" or not isClosed then return end local class="lake"; if natural=="bay" then class="ocean" elseif waterway~="" then class="river" end - way:Layer("water",true) --- SetMinZoomByArea(way) - way:Attribute("class",class) + Layer("water",true) +-- SetMinZoomByArea() + Attribute("class",class) - if way:Find("intermittent")=="yes" then way:Attribute("intermittent",1) end + if Find("intermittent")=="yes" then Attribute("intermittent",1) end -- we only want to show the names of actual lakes not every man-made basin that probably doesn't even have a name other than "basin" 
-- examples for which we don't want to show a name: -- https://www.openstreetmap.org/way/25958687 -- https://www.openstreetmap.org/way/27201902 -- https://www.openstreetmap.org/way/25309134 -- https://www.openstreetmap.org/way/24579306 - if way:Holds("name") and natural=="water" and water ~= "basin" and water ~= "wastewater" then - way:LayerAsCentroid("water_name_detail") - SetNameAttributes(way) --- SetMinZoomByArea(way) - way:Attribute("class", class) + if Holds("name") and natural=="water" and water ~= "basin" and water ~= "wastewater" then + LayerAsCentroid("water_name_detail") + SetNameAttributes() +-- SetMinZoomByArea() + Attribute("class", class) end return -- in case we get any landuse processing @@ -388,11 +388,11 @@ function way_function(way) if l=="" then l=natural end if l=="" then l=leisure end if landcoverKeys[l] then - way:Layer("landcover", true) - SetMinZoomByArea(way) - way:Attribute("class", landcoverKeys[l]) - if l=="wetland" then way:Attribute("subclass", way:Find("wetland")) - else way:Attribute("subclass", l) end + Layer("landcover", true) + SetMinZoomByArea() + Attribute("class", landcoverKeys[l]) + if l=="wetland" then Attribute("subclass", Find("wetland")) + else Attribute("subclass", l) end write_name = true -- Set 'landuse' @@ -400,26 +400,26 @@ function way_function(way) if l=="" then l=amenity end if l=="" then l=tourism end if landuseKeys[l] then - way:Layer("landuse", true) - way:Attribute("class", l) + Layer("landuse", true) + Attribute("class", l) write_name = true end end -- Parks - if boundary=="national_park" then way:Layer("park",true); way:Attribute("class",boundary); SetNameAttributes(way) - elseif leisure=="nature_reserve" then way:Layer("park",true); way:Attribute("class",leisure ); SetNameAttributes(way) end + if boundary=="national_park" then Layer("park",true); Attribute("class",boundary); SetNameAttributes() + elseif leisure=="nature_reserve" then Layer("park",true); Attribute("class",leisure ); SetNameAttributes() end 
-- POIs ('poi' and 'poi_detail') - local rank, class, subclass = GetPOIRank(way) + local rank, class, subclass = GetPOIRank() if rank then WritePOI(way,class,subclass,rank); return end -- Catch-all - if (building~="" or write_name) and way:Holds("name") then - way:LayerAsCentroid("poi_detail") - SetNameAttributes(way) + if (building~="" or write_name) and Holds("name") then + LayerAsCentroid("poi_detail") + SetNameAttributes() if write_name then rank=6 else rank=25 end - way:AttributeNumeric("rank", rank) + AttributeNumeric("rank", rank) end end @@ -435,65 +435,67 @@ end function WritePOI(obj,class,subclass,rank) local layer = "poi" if rank>4 then layer="poi_detail" end - obj:LayerAsCentroid(layer) + LayerAsCentroid(layer) SetNameAttributes(obj) - obj:AttributeNumeric("rank", rank) - obj:Attribute("class", class) - obj:Attribute("subclass", subclass) + AttributeNumeric("rank", rank) + Attribute("class", class) + Attribute("subclass", subclass) end -- Set name attributes on any object function SetNameAttributes(obj) - local name = obj:Find("name"), main_written = name, iname + local name = Find("name") + local main_written = name + local iname -- if we have a preferred language, then write that (if available), and additionally write the base name tag - if preferred_language and obj:Holds("name:"..preferred_language) then - iname = obj:Find("name:"..preferred_language) + if preferred_language and Holds("name:"..preferred_language) then + iname = Find("name:"..preferred_language) print("Found "..preferred_language..": "..iname) - obj:Attribute(preferred_language_attribute, iname) + Attribute(preferred_language_attribute, iname) if iname~=name and default_language_attribute then - obj:Attribute(default_language_attribute, name) + Attribute(default_language_attribute, name) else main_written = iname end else - obj:Attribute(preferred_language_attribute, name) + Attribute(preferred_language_attribute, name) end -- then set any additional languages for i,lang in 
ipairs(additional_languages) do - iname = obj:Find("name:"..lang) + iname = Find("name:"..lang) if iname=="" then iname=name end - if iname~=main_written then obj:Attribute("name:"..lang, iname) end + if iname~=main_written then Attribute("name:"..lang, iname) end end end -- Set ele and ele_ft on any object function SetEleAttributes(obj) - local ele = obj:Find("ele") + local ele = Find("ele") if ele ~= "" then local meter = math.floor(tonumber(ele) or 0) local feet = math.floor(meter * 3.2808399) - obj:AttributeNumeric("ele", meter) - obj:AttributeNumeric("ele_ft", feet) + AttributeNumeric("ele", meter) + AttributeNumeric("ele_ft", feet) end end function SetBrunnelAttributes(obj) - if obj:Find("bridge") == "yes" then obj:Attribute("brunnel", "bridge") - elseif obj:Find("tunnel") == "yes" then obj:Attribute("brunnel", "tunnel") - elseif obj:Find("ford") == "yes" then obj:Attribute("brunnel", "ford") + if Find("bridge") == "yes" then Attribute("brunnel", "bridge") + elseif Find("tunnel") == "yes" then Attribute("brunnel", "tunnel") + elseif Find("ford") == "yes" then Attribute("brunnel", "ford") end end -- Set minimum zoom level by area -function SetMinZoomByArea(way) - local area=way:Area() - if area>ZRES5^2 then way:MinZoom(6) - elseif area>ZRES6^2 then way:MinZoom(7) - elseif area>ZRES7^2 then way:MinZoom(8) - elseif area>ZRES8^2 then way:MinZoom(9) - elseif area>ZRES9^2 then way:MinZoom(10) - elseif area>ZRES10^2 then way:MinZoom(11) - elseif area>ZRES11^2 then way:MinZoom(12) - elseif area>ZRES12^2 then way:MinZoom(13) - else way:MinZoom(14) end +function SetMinZoomByArea() + local area=Area() + if area>ZRES5^2 then MinZoom(6) + elseif area>ZRES6^2 then MinZoom(7) + elseif area>ZRES7^2 then MinZoom(8) + elseif area>ZRES8^2 then MinZoom(9) + elseif area>ZRES9^2 then MinZoom(10) + elseif area>ZRES10^2 then MinZoom(11) + elseif area>ZRES11^2 then MinZoom(12) + elseif area>ZRES12^2 then MinZoom(13) + else MinZoom(14) end end -- Calculate POIs (typically rank 1-4 go 
to 'poi' z12-14, rank 5+ to 'poi_detail' z14) @@ -503,8 +505,8 @@ function GetPOIRank(obj) -- Can we find the tag? for k,list in pairs(poiTags) do - if list[obj:Find(k)] then - v = obj:Find(k) -- k/v are the OSM tag pair + if list[Find(k)] then + v = Find(k) -- k/v are the OSM tag pair class = poiClasses[v] or v rank = poiClassRanks[class] or 25 return rank, class, v @@ -512,7 +514,7 @@ function GetPOIRank(obj) end -- Catch-all for shops - local shop = obj:Find("shop") + local shop = Find("shop") if shop~="" then return poiClassRanks['shop'], "shop", shop end -- Nothing found diff --git a/resources/process-example.lua b/resources/process-example.lua index 41b461df..b4b1f108 100644 --- a/resources/process-example.lua +++ b/resources/process-example.lua @@ -14,33 +14,33 @@ end -- Assign nodes to a layer, and set attributes, based on OSM tags function node_function(node) - local amenity = node:Find("amenity") - local shop = node:Find("shop") + local amenity = Find("amenity") + local shop = Find("shop") if amenity~="" or shop~="" then - node:Layer("poi", false) - if amenity~="" then node:Attribute("class",amenity) - else node:Attribute("class",shop) end - node:Attribute("name", node:Find("name")) + Layer("poi", false) + if amenity~="" then Attribute("class",amenity) + else Attribute("class",shop) end + Attribute("name", Find("name")) end end -- Similarly for ways -function way_function(way) - local highway = way:Find("highway") - local waterway = way:Find("waterway") - local building = way:Find("building") +function way_function() + local highway = Find("highway") + local waterway = Find("waterway") + local building = Find("building") if highway~="" then - way:Layer("transportation", false) - way:Attribute("class", highway) --- way:Attribute("id",way:Id()) --- way:AttributeNumeric("area",37) + Layer("transportation", false) + Attribute("class", highway) +-- Attribute("id",Id()) +-- AttributeNumeric("area",37) end if waterway~="" then - way:Layer("waterway", false) - 
way:Attribute("class", waterway) + Layer("waterway", false) + Attribute("class", waterway) end if building~="" then - way:Layer("building", true) + Layer("building", true) end end diff --git a/resources/process-openmaptiles.lua b/resources/process-openmaptiles.lua index c7f74745..6ede9d26 100644 --- a/resources/process-openmaptiles.lua +++ b/resources/process-openmaptiles.lua @@ -118,36 +118,36 @@ function calcRank(place, population, capital_al) end -function node_function(node) +function node_function() -- Write 'aerodrome_label' - local aeroway = node:Find("aeroway") + local aeroway = Find("aeroway") if aeroway == "aerodrome" then - node:Layer("aerodrome_label", false) - SetNameAttributes(node) - node:Attribute("iata", node:Find("iata")) - SetEleAttributes(node) - node:Attribute("icao", node:Find("icao")) + Layer("aerodrome_label", false) + SetNameAttributes() + Attribute("iata", Find("iata")) + SetEleAttributes() + Attribute("icao", Find("icao")) - local aerodrome_value = node:Find("aerodrome") + local aerodrome_value = Find("aerodrome") local class if aerodromeValues[aerodrome_value] then class = aerodrome_value else class = "other" end - node:Attribute("class", class) + Attribute("class", class) end -- Write 'housenumber' - local housenumber = node:Find("addr:housenumber") + local housenumber = Find("addr:housenumber") if housenumber~="" then - node:Layer("housenumber", false) - node:Attribute("housenumber", housenumber) + Layer("housenumber", false) + Attribute("housenumber", housenumber) end -- Write 'place' -- note that OpenMapTiles has a rank for countries (1-3), states (1-6) and cities (1-10+); -- we could potentially approximate it for cities based on the population tag - local place = node:Find("place") + local place = Find("place") if place ~= "" then local mz = 13 - local pop = tonumber(node:Find("population")) or 0 - local capital = capitalLevel(node:Find("capital")) + local pop = tonumber(Find("population")) or 0 + local capital = 
capitalLevel(Find("capital")) local rank = calcRank(place, pop, capital) if place == "continent" then mz=0 @@ -167,33 +167,33 @@ function node_function(node) elseif place == "locality" then mz=13 end - node:Layer("place", false) - node:Attribute("class", place) - node:MinZoom(mz) - if rank then node:AttributeNumeric("rank", rank) end - if capital then node:AttributeNumeric("capital", capital) end - if place=="country" then node:Attribute("iso_a2", node:Find("ISO3166-1:alpha2")) end - SetNameAttributes(node) + Layer("place", false) + Attribute("class", place) + MinZoom(mz) + if rank then AttributeNumeric("rank", rank) end + if capital then AttributeNumeric("capital", capital) end + if place=="country" then Attribute("iso_a2", Find("ISO3166-1:alpha2")) end + SetNameAttributes() return end -- Write 'poi' - local rank, class, subclass = GetPOIRank(node) - if rank then WritePOI(node,class,subclass,rank) end + local rank, class, subclass = GetPOIRank() + if rank then WritePOI(class,subclass,rank) end -- Write 'mountain_peak' and 'water_name' - local natural = node:Find("natural") + local natural = Find("natural") if natural == "peak" or natural == "volcano" then - node:Layer("mountain_peak", false) - SetEleAttributes(node) - node:AttributeNumeric("rank", 1) - node:Attribute("class", natural) - SetNameAttributes(node) + Layer("mountain_peak", false) + SetEleAttributes() + AttributeNumeric("rank", 1) + Attribute("class", natural) + SetNameAttributes() return end if natural == "bay" then - node:Layer("water_name", false) - SetNameAttributes(node) + Layer("water_name", false) + SetNameAttributes() return end end @@ -279,81 +279,81 @@ waterwayClasses = Set { "stream", "river", "canal", "drain", "ditch" } -- Scan relations for use in ways -function relation_scan_function(relation) - if relation:Find("type")=="boundary" and relation:Find("boundary")=="administrative" then - relation:Accept() +function relation_scan_function() + if Find("type")=="boundary" and 
Find("boundary")=="administrative" then + Accept() end end -function write_to_transportation_layer(way, minzoom, highway_class) - way:Layer("transportation", false) - way:MinZoom(minzoom) - SetZOrder(way) - way:Attribute("class", highway_class) - SetBrunnelAttributes(way) - if ramp then way:AttributeNumeric("ramp",1) end +function write_to_transportation_layer(minzoom, highway_class) + Layer("transportation", false) + MinZoom(minzoom) + SetZOrder() + Attribute("class", highway_class) + SetBrunnelAttributes() + if ramp then AttributeNumeric("ramp",1) end -- Service - if highway == "service" and service ~="" then way:Attribute("service", service) end + if highway == "service" and service ~="" then Attribute("service", service) end - local oneway = way:Find("oneway") + local oneway = Find("oneway") if oneway == "yes" or oneway == "1" then - way:AttributeNumeric("oneway",1) + AttributeNumeric("oneway",1) end if oneway == "-1" then -- **** TODO end - local surface = way:Find("surface") - local surfaceMinzoom = 12 + local surface = Find("surface") + local surfaceMinzoom = 12 if pavedValues[surface] then - way:Attribute("surface", "paved", surfaceMinzoom) + Attribute("surface", "paved", surfaceMinzoom) elseif unpavedValues[surface] then - way:Attribute("surface", "unpaved", surfaceMinzoom) - end - local accessMinzoom = 9 - if way:Holds("access") then way:Attribute("access", way:Find("access"), accessMinzoom) end - if way:Holds("bicycle") then way:Attribute("bicycle", way:Find("bicycle"), accessMinzoom) end - if way:Holds("foot") then way:Attribute("foot", way:Find("foot"), accessMinzoom) end - if way:Holds("horse") then way:Attribute("horse", way:Find("horse"), accessMinzoom) end - way:AttributeBoolean("toll", way:Find("toll") == "yes", accessMinzoom) - way:AttributeNumeric("layer", tonumber(way:Find("layer")) or 0, accessMinzoom) - way:AttributeBoolean("expressway", way:Find("expressway"), 7) - way:Attribute("mtb_scale", way:Find("mtb:scale"), 10) + Attribute("surface", 
"unpaved", surfaceMinzoom) + end + local accessMinzoom = 9 + if Holds("access") then Attribute("access", Find("access"), accessMinzoom) end + if Holds("bicycle") then Attribute("bicycle", Find("bicycle"), accessMinzoom) end + if Holds("foot") then Attribute("foot", Find("foot"), accessMinzoom) end + if Holds("horse") then Attribute("horse", Find("horse"), accessMinzoom) end + AttributeBoolean("toll", Find("toll") == "yes", accessMinzoom) + AttributeNumeric("layer", tonumber(Find("layer")) or 0, accessMinzoom) + AttributeBoolean("expressway", Find("expressway"), 7) + Attribute("mtb_scale", Find("mtb:scale"), 10) end -- Process way tags -function way_function(way) - local route = way:Find("route") - local highway = way:Find("highway") - local waterway = way:Find("waterway") - local water = way:Find("water") - local building = way:Find("building") - local natural = way:Find("natural") - local historic = way:Find("historic") - local landuse = way:Find("landuse") - local leisure = way:Find("leisure") - local amenity = way:Find("amenity") - local aeroway = way:Find("aeroway") - local railway = way:Find("railway") - local service = way:Find("service") - local sport = way:Find("sport") - local shop = way:Find("shop") - local tourism = way:Find("tourism") - local man_made = way:Find("man_made") - local boundary = way:Find("boundary") - local isClosed = way:IsClosed() - local housenumber = way:Find("addr:housenumber") +function way_function() + local route = Find("route") + local highway = Find("highway") + local waterway = Find("waterway") + local water = Find("water") + local building = Find("building") + local natural = Find("natural") + local historic = Find("historic") + local landuse = Find("landuse") + local leisure = Find("leisure") + local amenity = Find("amenity") + local aeroway = Find("aeroway") + local railway = Find("railway") + local service = Find("service") + local sport = Find("sport") + local shop = Find("shop") + local tourism = Find("tourism") + local 
man_made = Find("man_made") + local boundary = Find("boundary") + local isClosed = IsClosed() + local housenumber = Find("addr:housenumber") local write_name = false - local construction = way:Find("construction") + local construction = Find("construction") -- Miscellaneous preprocessing - if way:Find("disused") == "yes" then return end - if boundary~="" and way:Find("protection_title")=="National Forest" and way:Find("operator")=="United States Forest Service" then return end + if Find("disused") == "yes" then return end + if boundary~="" and Find("protection_title")=="National Forest" and Find("operator")=="United States Forest Service" then return end if highway == "proposed" then return end if aerowayBuildings[aeroway] then building="yes"; aeroway="" end if landuse == "field" then landuse = "farmland" end - if landuse == "meadow" and way:Find("meadow")=="agricultural" then landuse="farmland" end + if landuse == "meadow" and Find("meadow")=="agricultural" then landuse="farmland" end -- Boundaries within relations -- note that we process administrative boundaries as properties on ways, rather than as single relation geometries, @@ -361,21 +361,21 @@ function way_function(way) local admin_level = 11 local isBoundary = false while true do - local rel = way:NextRelation() + local rel = NextRelation() if not rel then break end isBoundary = true - admin_level = math.min(admin_level, tonumber(way:FindInRelation("admin_level")) or 11) + admin_level = math.min(admin_level, tonumber(FindInRelation("admin_level")) or 11) end -- Boundaries in ways if boundary=="administrative" then - admin_level = math.min(admin_level, tonumber(way:Find("admin_level")) or 11) + admin_level = math.min(admin_level, tonumber(Find("admin_level")) or 11) isBoundary = true end -- Administrative boundaries -- https://openmaptiles.org/schema/#boundary - if isBoundary and not (way:Find("maritime")=="yes") then + if isBoundary and not (Find("maritime")=="yes") then local mz = 0 if admin_level>=3 and 
admin_level<5 then mz=4 elseif admin_level>=5 and admin_level<7 then mz=8 @@ -383,22 +383,22 @@ function way_function(way) elseif admin_level>=8 then mz=12 end - way:Layer("boundary",false) - way:AttributeNumeric("admin_level", admin_level) - way:MinZoom(mz) + Layer("boundary",false) + AttributeNumeric("admin_level", admin_level) + MinZoom(mz) -- disputed status (0 or 1). some styles need to have the 0 to show it. - local disputed = way:Find("disputed") + local disputed = Find("disputed") if disputed=="yes" then - way:AttributeNumeric("disputed", 1) + AttributeNumeric("disputed", 1) else - way:AttributeNumeric("disputed", 0) + AttributeNumeric("disputed", 0) end end -- Roads ('transportation' and 'transportation_name', plus 'transportation_name_detail') if highway~="" then - local access = way:Find("access") - local surface = way:Find("surface") + local access = Find("access") + local surface = Find("surface") local h = highway local minzoom = 99 @@ -439,158 +439,158 @@ function way_function(way) -- Write to layer if minzoom <= 14 then - write_to_transportation_layer(way, minzoom, h) + write_to_transportation_layer(minzoom, h) -- Write names if minzoom < 8 then minzoom = 8 end if highway == "motorway" or highway == "trunk" then - way:Layer("transportation_name", false) - way:MinZoom(minzoom) + Layer("transportation_name", false) + MinZoom(minzoom) elseif h == "minor" or h == "track" or h == "path" or h == "service" then - way:Layer("transportation_name_detail", false) - way:MinZoom(minzoom) + Layer("transportation_name_detail", false) + MinZoom(minzoom) else - way:Layer("transportation_name_mid", false) - way:MinZoom(minzoom) + Layer("transportation_name_mid", false) + MinZoom(minzoom) end - SetNameAttributes(way) - way:Attribute("class",h) - way:Attribute("network","road") -- **** could also be us-interstate, us-highway, us-state - if h~=highway then way:Attribute("subclass",highway) end - local ref = way:Find("ref") + SetNameAttributes() + Attribute("class",h) + 
Attribute("network","road") -- **** could also be us-interstate, us-highway, us-state + if h~=highway then Attribute("subclass",highway) end + local ref = Find("ref") if ref~="" then - way:Attribute("ref",ref) - way:AttributeNumeric("ref_length",ref:len()) + Attribute("ref",ref) + AttributeNumeric("ref_length",ref:len()) end end end -- Railways ('transportation' and 'transportation_name', plus 'transportation_name_detail') if railway~="" then - way:Layer("transportation", false) - way:Attribute("class", railway) - SetZOrder(way) - SetBrunnelAttributes(way) + Layer("transportation", false) + Attribute("class", railway) + SetZOrder() + SetBrunnelAttributes() if service~="" then - way:Attribute("service", service) - way:MinZoom(12) + Attribute("service", service) + MinZoom(12) else - way:MinZoom(9) + MinZoom(9) end - way:Layer("transportation_name", false) - SetNameAttributes(way) - way:MinZoom(14) - way:Attribute("class", "rail") + Layer("transportation_name", false) + SetNameAttributes() + MinZoom(14) + Attribute("class", "rail") end -- Pier if man_made=="pier" then - way:Layer("transportation", isClosed) - SetZOrder(way) - way:Attribute("class", "pier") - SetMinZoomByArea(way) + Layer("transportation", isClosed) + SetZOrder() + Attribute("class", "pier") + SetMinZoomByArea() end -- 'Ferry' if route=="ferry" then - way:Layer("transportation", false) - way:Attribute("class", "ferry") - SetZOrder(way) - way:MinZoom(9) - SetBrunnelAttributes(way) + Layer("transportation", false) + Attribute("class", "ferry") + SetZOrder() + MinZoom(9) + SetBrunnelAttributes() - way:Layer("transportation_name", false) - SetNameAttributes(way) - way:MinZoom(12) - way:Attribute("class", "ferry") + Layer("transportation_name", false) + SetNameAttributes() + MinZoom(12) + Attribute("class", "ferry") end -- 'Aeroway' if aeroway~="" then - way:Layer("aeroway", isClosed) - way:Attribute("class",aeroway) - way:Attribute("ref",way:Find("ref")) + Layer("aeroway", isClosed) + 
Attribute("class",aeroway) + Attribute("ref",Find("ref")) write_name = true end -- 'aerodrome_label' if aeroway=="aerodrome" then - way:LayerAsCentroid("aerodrome_label") - SetNameAttributes(way) - way:Attribute("iata", way:Find("iata")) - SetEleAttributes(way) - way:Attribute("icao", way:Find("icao")) + LayerAsCentroid("aerodrome_label") + SetNameAttributes() + Attribute("iata", Find("iata")) + SetEleAttributes() + Attribute("icao", Find("icao")) - local aerodrome = way:Find(aeroway) + local aerodrome = Find(aeroway) local class if aerodromeValues[aerodrome] then class = aerodrome else class = "other" end - way:Attribute("class", class) + Attribute("class", class) end -- Set 'waterway' and associated if waterwayClasses[waterway] and not isClosed then - if waterway == "river" and way:Holds("name") then - way:Layer("waterway", false) + if waterway == "river" and Holds("name") then + Layer("waterway", false) else - way:Layer("waterway_detail", false) + Layer("waterway_detail", false) end - if way:Find("intermittent")=="yes" then way:AttributeNumeric("intermittent", 1) else way:AttributeNumeric("intermittent", 0) end - way:Attribute("class", waterway) - SetNameAttributes(way) - SetBrunnelAttributes(way) - elseif waterway == "boatyard" then way:Layer("landuse", isClosed); way:Attribute("class", "industrial"); way:MinZoom(12) - elseif waterway == "dam" then way:Layer("building",isClosed) - elseif waterway == "fuel" then way:Layer("landuse", isClosed); way:Attribute("class", "industrial"); way:MinZoom(14) + if Find("intermittent")=="yes" then AttributeNumeric("intermittent", 1) else AttributeNumeric("intermittent", 0) end + Attribute("class", waterway) + SetNameAttributes() + SetBrunnelAttributes() + elseif waterway == "boatyard" then Layer("landuse", isClosed); Attribute("class", "industrial"); MinZoom(12) + elseif waterway == "dam" then Layer("building",isClosed) + elseif waterway == "fuel" then Layer("landuse", isClosed); Attribute("class", "industrial"); MinZoom(14) 
end -- Set names on rivers if waterwayClasses[waterway] and not isClosed then - if waterway == "river" and way:Holds("name") then - way:Layer("water_name", false) + if waterway == "river" and Holds("name") then + Layer("water_name", false) else - way:Layer("water_name_detail", false) - way:MinZoom(14) + Layer("water_name_detail", false) + MinZoom(14) end - way:Attribute("class", waterway) - SetNameAttributes(way) + Attribute("class", waterway) + SetNameAttributes() end -- Set 'building' and associated if building~="" then - way:Layer("building", true) - SetBuildingHeightAttributes(way) - SetMinZoomByArea(way) + Layer("building", true) + SetBuildingHeightAttributes() + SetMinZoomByArea() end -- Set 'housenumber' if housenumber~="" then - way:LayerAsCentroid("housenumber", false) - way:Attribute("housenumber", housenumber) + LayerAsCentroid("housenumber", false) + Attribute("housenumber", housenumber) end -- Set 'water' if natural=="water" or leisure=="swimming_pool" or landuse=="reservoir" or landuse=="basin" or waterClasses[waterway] then - if way:Find("covered")=="yes" or not isClosed then return end + if Find("covered")=="yes" or not isClosed then return end local class="lake"; if waterway~="" then class="river" end - if class=="lake" and way:Find("wikidata")=="Q192770" then return end - way:Layer("water",true) - SetMinZoomByArea(way) - way:Attribute("class",class) + if class=="lake" and Find("wikidata")=="Q192770" then return end + Layer("water",true) + SetMinZoomByArea() + Attribute("class",class) - if way:Find("intermittent")=="yes" then way:Attribute("intermittent",1) end + if Find("intermittent")=="yes" then Attribute("intermittent",1) end -- we only want to show the names of actual lakes not every man-made basin that probably doesn't even have a name other than "basin" -- examples for which we don't want to show a name: -- https://www.openstreetmap.org/way/25958687 -- https://www.openstreetmap.org/way/27201902 -- https://www.openstreetmap.org/way/25309134 
-- https://www.openstreetmap.org/way/24579306 - if way:Holds("name") and natural=="water" and water ~= "basin" and water ~= "wastewater" then - way:LayerAsCentroid("water_name_detail") - SetNameAttributes(way) - SetMinZoomByArea(way) - way:Attribute("class", class) + if Holds("name") and natural=="water" and water ~= "basin" and water ~= "wastewater" then + LayerAsCentroid("water_name_detail") + SetNameAttributes() + SetMinZoomByArea() + Attribute("class", class) end return -- in case we get any landuse processing @@ -601,11 +601,11 @@ function way_function(way) if l=="" then l=natural end if l=="" then l=leisure end if landcoverKeys[l] then - way:Layer("landcover", true) - SetMinZoomByArea(way) - way:Attribute("class", landcoverKeys[l]) - if l=="wetland" then way:Attribute("subclass", way:Find("wetland")) - else way:Attribute("subclass", l) end + Layer("landcover", true) + SetMinZoomByArea() + Attribute("class", landcoverKeys[l]) + if l=="wetland" then Attribute("subclass", Find("wetland")) + else Attribute("subclass", l) end write_name = true -- Set 'landuse' @@ -613,31 +613,31 @@ function way_function(way) if l=="" then l=amenity end if l=="" then l=tourism end if landuseKeys[l] then - way:Layer("landuse", true) - way:Attribute("class", l) + Layer("landuse", true) + Attribute("class", l) if l=="residential" then - if way:Area()4 then layer="poi_detail" end - obj:LayerAsCentroid(layer) - SetNameAttributes(obj) - obj:AttributeNumeric("rank", rank) - obj:Attribute("class", class) - obj:Attribute("subclass", subclass) + LayerAsCentroid(layer) + SetNameAttributes() + AttributeNumeric("rank", rank) + Attribute("class", class) + Attribute("subclass", subclass) -- layer defaults to 0 - obj:AttributeNumeric("layer", tonumber(obj:Find("layer")) or 0) + AttributeNumeric("layer", tonumber(Find("layer")) or 0) -- indoor defaults to false - obj:AttributeBoolean("indoor", (obj:Find("indoor") == "yes")) + AttributeBoolean("indoor", (Find("indoor") == "yes")) -- level has no 
default - local level = tonumber(obj:Find("level")) + local level = tonumber(Find("level")) if level then - obj:AttributeNumeric("level", level) + AttributeNumeric("level", level) end end -- Set name attributes on any object -function SetNameAttributes(obj) - local name = obj:Find("name"), iname +function SetNameAttributes() + local name = Find("name"), iname local main_written = name -- if we have a preferred language, then write that (if available), and additionally write the base name tag - if preferred_language and obj:Holds("name:"..preferred_language) then - iname = obj:Find("name:"..preferred_language) - obj:Attribute(preferred_language_attribute, iname) + if preferred_language and Holds("name:"..preferred_language) then + iname = Find("name:"..preferred_language) + Attribute(preferred_language_attribute, iname) if iname~=name and default_language_attribute then - obj:Attribute(default_language_attribute, name) + Attribute(default_language_attribute, name) else main_written = iname end else - obj:Attribute(preferred_language_attribute, name) + Attribute(preferred_language_attribute, name) end -- then set any additional languages for i,lang in ipairs(additional_languages) do - iname = obj:Find("name:"..lang) + iname = Find("name:"..lang) if iname=="" then iname=name end - if iname~=main_written then obj:Attribute("name:"..lang, iname) end + if iname~=main_written then Attribute("name:"..lang, iname) end end end -- Set ele and ele_ft on any object -function SetEleAttributes(obj) - local ele = obj:Find("ele") +function SetEleAttributes() + local ele = Find("ele") if ele ~= "" then local meter = math.floor(tonumber(ele) or 0) local feet = math.floor(meter * 3.2808399) - obj:AttributeNumeric("ele", meter) - obj:AttributeNumeric("ele_ft", feet) + AttributeNumeric("ele", meter) + AttributeNumeric("ele_ft", feet) end end -function SetBrunnelAttributes(obj) - if obj:Find("bridge") == "yes" then obj:Attribute("brunnel", "bridge") - elseif obj:Find("tunnel") == "yes" 
then obj:Attribute("brunnel", "tunnel") - elseif obj:Find("ford") == "yes" then obj:Attribute("brunnel", "ford") +function SetBrunnelAttributes() + if Find("bridge") == "yes" then Attribute("brunnel", "bridge") + elseif Find("tunnel") == "yes" then Attribute("brunnel", "tunnel") + elseif Find("ford") == "yes" then Attribute("brunnel", "ford") end end -- Set minimum zoom level by area -function SetMinZoomByArea(way) - local area=way:Area() - if area>ZRES5^2 then way:MinZoom(6) - elseif area>ZRES6^2 then way:MinZoom(7) - elseif area>ZRES7^2 then way:MinZoom(8) - elseif area>ZRES8^2 then way:MinZoom(9) - elseif area>ZRES9^2 then way:MinZoom(10) - elseif area>ZRES10^2 then way:MinZoom(11) - elseif area>ZRES11^2 then way:MinZoom(12) - elseif area>ZRES12^2 then way:MinZoom(13) - else way:MinZoom(14) end +function SetMinZoomByArea() + local area=Area() + if area>ZRES5^2 then MinZoom(6) + elseif area>ZRES6^2 then MinZoom(7) + elseif area>ZRES7^2 then MinZoom(8) + elseif area>ZRES8^2 then MinZoom(9) + elseif area>ZRES9^2 then MinZoom(10) + elseif area>ZRES10^2 then MinZoom(11) + elseif area>ZRES11^2 then MinZoom(12) + elseif area>ZRES12^2 then MinZoom(13) + else MinZoom(14) end end -- Calculate POIs (typically rank 1-4 go to 'poi' z12-14, rank 5+ to 'poi_detail' z14) -- returns rank, class, subclass -function GetPOIRank(obj) +function GetPOIRank() local k,list,v,class,rank -- Can we find the tag? 
for k,list in pairs(poiTags) do - if list[obj:Find(k)] then - v = obj:Find(k) -- k/v are the OSM tag pair + if list[Find(k)] then + v = Find(k) -- k/v are the OSM tag pair class = poiClasses[v] or k rank = poiClassRanks[class] or 25 subclassKey = poiSubClasses[v] if subclassKey then class = v - v = obj:Find(subclassKey) + v = Find(subclassKey) end return rank, class, v end end -- Catch-all for shops - local shop = obj:Find("shop") + local shop = Find("shop") if shop~="" then return poiClassRanks['shop'], "shop", shop end -- Nothing found return nil,nil,nil end -function SetBuildingHeightAttributes(way) - local height = tonumber(way:Find("height"), 10) - local minHeight = tonumber(way:Find("min_height"), 10) - local levels = tonumber(way:Find("building:levels"), 10) - local minLevel = tonumber(way:Find("building:min_level"), 10) +function SetBuildingHeightAttributes() + local height = tonumber(Find("height"), 10) + local minHeight = tonumber(Find("min_height"), 10) + local levels = tonumber(Find("building:levels"), 10) + local minLevel = tonumber(Find("building:min_level"), 10) local renderHeight = BUILDING_FLOOR_HEIGHT if height or levels then @@ -779,17 +779,17 @@ function SetBuildingHeightAttributes(way) renderHeight = renderHeight + renderMinHeight end - way:AttributeNumeric("render_height", renderHeight) - way:AttributeNumeric("render_min_height", renderMinHeight) + AttributeNumeric("render_height", renderHeight) + AttributeNumeric("render_min_height", renderMinHeight) end -- Implement z_order as calculated by Imposm -- See https://imposm.org/docs/imposm3/latest/mapping.html#wayzorder for details. 
-function SetZOrder(way) - local highway = way:Find("highway") - local layer = tonumber(way:Find("layer")) - local bridge = way:Find("bridge") - local tunnel = way:Find("tunnel") +function SetZOrder() + local highway = Find("highway") + local layer = tonumber(Find("layer")) + local bridge = Find("bridge") + local tunnel = Find("tunnel") local zOrder = 0 if bridge ~= "" and bridge ~= "no" then zOrder = zOrder + 10 @@ -820,7 +820,7 @@ function SetZOrder(way) hwClass = 3 end zOrder = zOrder + hwClass - way:ZOrder(zOrder) + ZOrder(zOrder) end -- ========================================================== diff --git a/src/attribute_store.cpp b/src/attribute_store.cpp index f4f9f299..363d167b 100644 --- a/src/attribute_store.cpp +++ b/src/attribute_store.cpp @@ -55,19 +55,38 @@ const std::string& AttributeKeyStore::getKeyUnsafe(uint16_t index) const { return keys[index]; } +// AttributePair +void AttributePair::ensureStringIsOwned() { + // Before we store an AttributePair in our long-term storage, we need + // to make sure it's not pointing to a non-long-lived std::string. 
+ if (valueType == AttributePairType::Bool || valueType == AttributePairType::Float) + return; + + stringValue_.ensureStringIsOwned(); +} + // AttributePairStore -thread_local boost::container::flat_map tlsHotShardMap; -thread_local uint16_t tlsHotShardSize = 0; +thread_local DequeMap tlsHotShard(1 << 16); const AttributePair& AttributePairStore::getPair(uint32_t i) const { uint32_t shard = i >> (32 - SHARD_BITS); uint32_t offset = i & (~(~0u << (32 - SHARD_BITS))); - if (shard == 0) - return hotShard[offset]; + if (shard == 0) { + if (offset < tlsHotShard.size()) + return tlsHotShard[offset]; + + { + std::lock_guard lock(pairsMutex[0]); + tlsHotShard = pairs[0]; + } + + return tlsHotShard[offset]; + } std::lock_guard lock(pairsMutex[shard]); - return pairs[shard].at(offset); + return pairs[shard][offset]; }; + const AttributePair& AttributePairStore::getPairUnsafe(uint32_t i) const { // NB: This is unsafe if called before the PBF has been fully read. // If called during the output phase, it's safe. @@ -75,44 +94,43 @@ const AttributePair& AttributePairStore::getPairUnsafe(uint32_t i) const { uint32_t shard = i >> (32 - SHARD_BITS); uint32_t offset = i & (~(~0u << (32 - SHARD_BITS))); - if (shard == 0) - return hotShard[offset]; - - return pairs[shard].at(offset); + return pairs[shard][offset]; }; -uint32_t AttributePairStore::addPair(const AttributePair& pair, bool isHot) { +// Remember recently queried/added pairs so that we can return them in the +// future without taking a lock. +thread_local uint64_t tlsPairLookups = 0; +thread_local uint64_t tlsPairLookupsUncached = 0; + +thread_local std::vector cachedAttributePairPointers(64); +thread_local std::vector cachedAttributePairIndexes(64); +uint32_t AttributePairStore::addPair(AttributePair& pair, bool isHot) { if (isHot) { { // First, check our thread-local map. 
- const auto& it = tlsHotShardMap.find(&pair); - if (it != tlsHotShardMap.end()) - return it->second; + const auto& index = tlsHotShard.find(pair); + if (index != -1) + return index; } + // Not found, ensure our local map is up-to-date for future calls, // and fall through to the main map. - // - // Note that we can read `hotShard` without a lock - while (tlsHotShardSize < hotShardSize.load()) { - tlsHotShardSize++; - tlsHotShardMap[&hotShard[tlsHotShardSize]] = tlsHotShardSize; + if (!tlsHotShard.full()) { + std::lock_guard lock(pairsMutex[0]); + tlsHotShard = pairs[0]; } // This might be a popular pair, worth re-using. // Have we already assigned it a hot ID? std::lock_guard lock(pairsMutex[0]); - const auto& it = pairsMaps[0].find(&pair); - if (it != pairsMaps[0].end()) - return it->second; + const auto& index = pairs[0].find(pair); + if (index != -1) + return index; - if (hotShardSize.load() < 1 << 16) { - hotShardSize++; - uint32_t offset = hotShardSize.load(); - - hotShard[offset] = pair; - const AttributePair* ptr = &hotShard[offset]; + if (!pairs[0].full()) { + pair.ensureStringIsOwned(); + uint32_t offset = pairs[0].add(pair); uint32_t rv = (0 << (32 - SHARD_BITS)) + offset; - pairsMaps[0][ptr] = rv; return rv; } } @@ -121,6 +139,23 @@ uint32_t AttributePairStore::addPair(const AttributePair& pair, bool isHot) { // Throw it on the pile with the rest of the pairs. size_t hash = pair.hash(); + const size_t candidateIndex = hash % cachedAttributePairPointers.size(); + // Before taking a lock, see if we've seen this attribute pair recently. + + tlsPairLookups++; + if (tlsPairLookups % 1024 == 0) { + lookups += 1024; + } + + + { + const AttributePair* candidate = cachedAttributePairPointers[candidateIndex]; + + if (candidate != nullptr && *candidate == pair) + return cachedAttributePairIndexes[candidateIndex]; + } + + size_t shard = hash % ATTRIBUTE_SHARDS; // Shard 0 is for hot pairs -- pick another shard if it gets selected. 
if (shard == 0) shard = (hash >> 8) % ATTRIBUTE_SHARDS; @@ -129,20 +164,27 @@ uint32_t AttributePairStore::addPair(const AttributePair& pair, bool isHot) { if (shard == 0) shard = 1; std::lock_guard lock(pairsMutex[shard]); - const auto& it = pairsMaps[shard].find(&pair); - if (it != pairsMaps[shard].end()) - return it->second; - uint32_t offset = pairs[shard].size(); + tlsPairLookupsUncached++; + if (tlsPairLookupsUncached % 1024 == 0) + lookupsUncached += 1024; + + const auto& index = pairs[shard].find(pair); + if (index != -1) { + const uint32_t rv = (shard << (32 - SHARD_BITS)) + index; + cachedAttributePairPointers[candidateIndex] = &pairs[shard][index]; + cachedAttributePairIndexes[candidateIndex] = rv; + + return rv; + } + + pair.ensureStringIsOwned(); + uint32_t offset = pairs[shard].add(pair); if (offset >= (1 << (32 - SHARD_BITS))) throw std::out_of_range("pair shard overflow"); - pairs[shard].push_back(pair); - const AttributePair* ptr = &pairs[shard][offset]; uint32_t rv = (shard << (32 - SHARD_BITS)) + offset; - - pairsMaps[shard][ptr] = rv; return rv; }; @@ -199,21 +241,19 @@ void AttributeSet::removePairWithKey(const AttributePairStore& pairStore, uint32 } void AttributeStore::addAttribute(AttributeSet& attributeSet, std::string const &key, const std::string& v, char minzoom) { - AttributePair kv(keyStore.key2index(key),v,minzoom); - bool isHot = AttributePair::isHot(kv, key); - attributeSet.removePairWithKey(pairStore, kv.keyIndex); + PooledString ps(&v); + AttributePair kv(keyStore.key2index(key), ps, minzoom); + bool isHot = AttributePair::isHot(key, v); attributeSet.addPair(pairStore.addPair(kv, isHot)); } void AttributeStore::addAttribute(AttributeSet& attributeSet, std::string const &key, bool v, char minzoom) { AttributePair kv(keyStore.key2index(key),v,minzoom); - bool isHot = AttributePair::isHot(kv, key); - attributeSet.removePairWithKey(pairStore, kv.keyIndex); + bool isHot = true; // All bools are eligible to be hot pairs 
attributeSet.addPair(pairStore.addPair(kv, isHot)); } void AttributeStore::addAttribute(AttributeSet& attributeSet, std::string const &key, float v, char minzoom) { AttributePair kv(keyStore.key2index(key),v,minzoom); - bool isHot = AttributePair::isHot(kv, key); - attributeSet.removePairWithKey(pairStore, kv.keyIndex); + bool isHot = v >= 0 && v <= 25 && ceil(v) == v; // Whole numbers in 0..25 are eligible to be hot pairs attributeSet.addPair(pairStore.addPair(kv, isHot)); } @@ -254,33 +294,54 @@ void AttributeSet::finalize() { } +// Remember recently queried/added sets so that we can return them in the +// future without taking a lock. +thread_local std::vector cachedAttributeSetPointers(64); +thread_local std::vector cachedAttributeSetIndexes(64); + +thread_local uint64_t tlsSetLookups = 0; +thread_local uint64_t tlsSetLookupsUncached = 0; AttributeIndex AttributeStore::add(AttributeSet &attributes) { // TODO: there's probably a way to use C++ types to distinguish a finalized // and non-finalized AttributeSet, which would make this safer. attributes.finalize(); size_t hash = attributes.hash(); + + const size_t candidateIndex = hash % cachedAttributeSetPointers.size(); + // Before taking a lock, see if we've seen this attribute set recently. + + tlsSetLookups++; + if (tlsSetLookups % 1024 == 0) { + lookups += 1024; + } + + + { + const AttributeSet* candidate = cachedAttributeSetPointers[candidateIndex]; + + if (candidate != nullptr && *candidate == attributes) + return cachedAttributeSetIndexes[candidateIndex]; + } + size_t shard = hash % ATTRIBUTE_SHARDS; // We can't use the top 2 bits (see OutputObject's bitfields) shard = shard >> 2; std::lock_guard lock(setsMutex[shard]); - lookups++; - - // Do we already have it? 
- const auto& existing = setsMaps[shard].find(&attributes); - if (existing != setsMaps[shard].end()) return existing->second; + tlsSetLookupsUncached++; + if (tlsSetLookupsUncached % 1024 == 0) + lookupsUncached += 1024; - // No, so add and return the index - uint32_t offset = sets[shard].size(); + const uint32_t offset = sets[shard].add(attributes); if (offset >= (1 << (32 - SHARD_BITS))) throw std::out_of_range("set shard overflow"); - sets[shard].push_back(attributes); - const AttributeSet* ptr = &sets[shard][offset]; uint32_t rv = (shard << (32 - SHARD_BITS)) + offset; - setsMaps[shard][ptr] = rv; + + cachedAttributeSetPointers[candidateIndex] = &sets[shard][offset]; + cachedAttributeSetIndexes[candidateIndex] = rv; return rv; } @@ -307,16 +368,21 @@ std::vector AttributeStore::getUnsafe(AttributeIndex index } } -void AttributeStore::reportSize() const { +size_t AttributeStore::size() const { size_t numAttributeSets = 0; for (int i = 0; i < ATTRIBUTE_SHARDS; i++) numAttributeSets += sets[i].size(); - std::cout << "Attributes: " << numAttributeSets << " sets from " << lookups.load() << " objects" << std::endl; + + return numAttributeSets; +} + +void AttributeStore::reportSize() const { + std::cout << "Attributes: " << size() << " sets from " << lookups.load() << " objects (" << lookupsUncached.load() << " uncached), " << pairStore.lookups.load() << " pairs (" << pairStore.lookupsUncached.load() << " uncached)" << std::endl; // Print detailed histogram of frequencies of attributes. if (false) { for (int i = 0; i < ATTRIBUTE_SHARDS; i++) { - std::cout << "pairsMaps[" << i << "] has " << pairStore.pairsMaps[i].size() << " entries" << std::endl; + std::cout << "pairs[" << i << "] has " << pairStore.pairs[i].size() << " entries" << std::endl; } std::map tagCountDist; @@ -368,6 +434,20 @@ void AttributeStore::reportSize() const { } } +void AttributeStore::reset() { + // This is only used for tests. 
+ tlsKeys2Index.clear(); + tlsKeys2IndexSize = 0; + + tlsHotShard.clear(); + + for (int i = 0; i < cachedAttributeSetPointers.size(); i++) + cachedAttributeSetPointers[i] = nullptr; + + for (int i = 0; i < cachedAttributePairPointers.size(); i++) + cachedAttributePairPointers[i] = nullptr; +} + void AttributeStore::finalize() { finalized = true; keyStore.finalize(); diff --git a/src/helpers.cpp b/src/helpers.cpp index 444ddcf0..df210b95 100644 --- a/src/helpers.cpp +++ b/src/helpers.cpp @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include "helpers.h" @@ -11,7 +13,6 @@ #define MOD_GZIP_ZLIB_CFACTOR 9 #define MOD_GZIP_ZLIB_BSIZE 8096 -namespace geom = boost::geometry; using namespace std; // Bounding box string parsing @@ -89,7 +90,9 @@ std::string compress_string(const std::string& str, } // Decompress an STL string using zlib and return the original data. -std::string decompress_string(const std::string& str, bool asGzip) { +// The output buffer is passed in; callers are meant to re-use the buffer such +// that eventually no allocations are needed when decompressing. 
+void decompress_string(std::string& output, const char* input, uint32_t inputSize, bool asGzip) { z_stream zs; // z_stream is zlib's control structure memset(&zs, 0, sizeof(zs)); @@ -101,27 +104,27 @@ std::string decompress_string(const std::string& str, bool asGzip) { throw(std::runtime_error("inflateInit failed while decompressing.")); } - zs.next_in = (Bytef*)str.data(); - zs.avail_in = str.size(); + zs.next_in = (Bytef*)input; + zs.avail_in = inputSize; int ret; - char outbuffer[32768]; - std::string outstring; + + int actualOutputSize = 0; // get the decompressed bytes blockwise using repeated calls to inflate do { - zs.next_out = reinterpret_cast(outbuffer); - zs.avail_out = sizeof(outbuffer); + if (output.size() < actualOutputSize + 32768) + output.resize(actualOutputSize + 32768); - ret = inflate(&zs, 0); + zs.next_out = reinterpret_cast(&output[actualOutputSize]); + zs.avail_out = output.size() - actualOutputSize; - if (outstring.size() < zs.total_out) { - outstring.append(outbuffer, - zs.total_out - outstring.size()); - } + ret = inflate(&zs, 0); + actualOutputSize = zs.total_out; } while (ret == Z_OK); + output.resize(actualOutputSize); inflateEnd(&zs); if (ret != Z_STREAM_END) { // an error occurred that was not EOF @@ -130,8 +133,6 @@ std::string decompress_string(const std::string& str, bool asGzip) { << zs.msg; throw(std::runtime_error(oss.str())); } - - return outstring; } // Parse a Boost error diff --git a/src/mmap_allocator.cpp b/src/mmap_allocator.cpp index dc71f687..2b5e26fd 100644 --- a/src/mmap_allocator.cpp +++ b/src/mmap_allocator.cpp @@ -79,10 +79,10 @@ thread_local mmap_shm_ptr mmap_shm_thread_region_ptr; std::mutex mmap_allocator_mutex; mmap_file::mmap_file(std::string const &filename, std::size_t offset) - : mapping(filename.c_str(), boost::interprocess::read_write) + : filename(filename) + , mapping(filename.c_str(), boost::interprocess::read_write) , region(mapping, boost::interprocess::read_write) , 
buffer(boost::interprocess::create_only, reinterpret_cast(region.get_address()) + offset, region.get_size() - offset) - , filename(filename) { } mmap_file::~mmap_file() diff --git a/src/node_stores.cpp b/src/node_stores.cpp index 8c84b811..06e2fc5e 100644 --- a/src/node_stores.cpp +++ b/src/node_stores.cpp @@ -14,6 +14,17 @@ void BinarySearchNodeStore::reopen() } } +bool BinarySearchNodeStore::contains(size_t shard, NodeID i) const { + auto internalShard = mLatpLons[shardPart(i)]; + auto id = idPart(i); + + auto iter = std::lower_bound(internalShard->begin(), internalShard->end(), id, [](auto const &e, auto i) { + return e.first < i; + }); + + return !(iter == internalShard->end() || iter->first != id); +} + LatpLon BinarySearchNodeStore::at(NodeID i) const { auto shard = mLatpLons[shardPart(i)]; auto id = idPart(i); diff --git a/src/options_parser.cpp b/src/options_parser.cpp new file mode 100644 index 00000000..529e5f4a --- /dev/null +++ b/src/options_parser.cpp @@ -0,0 +1,114 @@ +#include "options_parser.h" + +#include +#include +#include +#include +#include "helpers.h" + +#ifndef TM_VERSION +#define TM_VERSION (version not set) +#endif +#define STR1(x) #x +#define STR(x) STR1(x) + +using namespace std; +namespace po = boost::program_options; + +po::options_description getParser(OptionsParser::Options& options) { + po::options_description desc("tilemaker " STR(TM_VERSION) "\nConvert OpenStreetMap .pbf files into vector tiles\n\nAvailable options"); + desc.add_options() + ("help", "show help message") + ("input", po::value< vector >(&options.inputFiles), "source .osm.pbf file") + ("output", po::value< string >(&options.outputFile), "target directory or .mbtiles/.pmtiles file") + ("bbox", po::value< string >(&options.bbox), "bounding box to use if input file does not have a bbox header set, example: minlon,minlat,maxlon,maxlat") + ("merge" ,po::bool_switch(&options.mergeSqlite), "merge with existing .mbtiles (overwrites otherwise)") + ("config", po::value< string 
>(&options.jsonFile)->default_value("config.json"), "config JSON file") + ("process",po::value< string >(&options.luaFile)->default_value("process.lua"), "tag-processing Lua file") + ("verbose",po::bool_switch(&options.verbose), "verbose error output") + ("skip-integrity",po::bool_switch(&options.osm.skipIntegrity), "don't enforce way/node integrity") + ("log-tile-timings", po::bool_switch(&options.logTileTimings), "log how long each tile takes"); + po::options_description performance("Performance options"); + performance.add_options() + ("store", po::value< string >(&options.osm.storeFile), "temporary storage for node/ways/relations data") + ("fast", po::bool_switch(&options.osm.fast), "prefer speed at the expense of memory") + ("compact",po::bool_switch(&options.osm.compact), "use faster data structure for node lookups\nNOTE: This requires the input to be renumbered (osmium renumber)") + ("no-compress-nodes", po::bool_switch(&options.osm.uncompressedNodes), "store nodes uncompressed") + ("no-compress-ways", po::bool_switch(&options.osm.uncompressedWays), "store ways uncompressed") + ("lazy-geometries", po::bool_switch(&options.osm.lazyGeometries), "generate geometries from the OSM stores; uses less memory") + ("materialize-geometries", po::bool_switch(&options.osm.materializeGeometries), "materialize geometries; uses more memory") + ("shard-stores", po::bool_switch(&options.osm.shardStores), "use an alternate reading/writing strategy for low-memory machines") + ("threads",po::value(&options.threadNum)->default_value(0), "number of threads (automatically detected if 0)") + ; + + desc.add(performance); + return desc; +} + +void OptionsParser::showHelp() { + Options options; + auto parser = getParser(options); + std::cout << parser << std::endl; +} + +OptionsParser::Options OptionsParser::parse(const int argc, const char* argv[]) { + Options options; + + po::options_description desc = getParser(options); + po::positional_options_description p; + p.add("input", 
1).add("output", 1); + + po::variables_map vm; + try { + po::store(po::command_line_parser(argc, argv).options(desc).positional(p).run(), vm); + } catch (const po::unknown_option& ex) { + throw OptionException{"Unknown option: " + ex.get_option_name()}; + } + po::notify(vm); + + if (options.osm.storeFile.empty()) { + options.osm.materializeGeometries = true; + } else { + if (!options.osm.fast) { + options.osm.shardStores = true; + } + } + + // You can pass --lazy-geometries to override the default of materialized geometries for + // the non-store case. + if (options.osm.lazyGeometries) + options.osm.materializeGeometries = false; + + + if (vm.count("help")) { + options.showHelp = true; + return options; + } + if (vm.count("output") == 0) { + throw OptionException{ "You must specify an output file or directory. Run with --help to find out more." }; + } + + if (vm.count("input") == 0) { + throw OptionException{ "No source .osm.pbf file supplied" }; + } + + if (ends_with(options.outputFile, ".mbtiles") || ends_with(options.outputFile, ".sqlite")) { + options.outputMode = OutputMode::MBTiles; + } else if (ends_with(options.outputFile, ".pmtiles")) { + options.outputMode = OutputMode::PMTiles; + } + + if (options.threadNum == 0) { + options.threadNum = max(thread::hardware_concurrency(), 1u); + } + + // ---- Check config + if (!boost::filesystem::exists(options.jsonFile)) { + throw OptionException{ "Couldn't open .json config: " + options.jsonFile }; + } + if (!boost::filesystem::exists(options.luaFile)) { + throw OptionException{"Couldn't open .lua script: " + options.luaFile }; + } + + return options; +} diff --git a/src/osm_lua_processing.cpp b/src/osm_lua_processing.cpp index a1bc2536..31d184ed 100644 --- a/src/osm_lua_processing.cpp +++ b/src/osm_lua_processing.cpp @@ -3,15 +3,132 @@ #include "osm_lua_processing.h" #include "attribute_store.h" #include "helpers.h" +#include "tag_map.h" #include "coordinates_geom.h" #include "osm_mem_tiles.h" using namespace std; 
+const std::string EMPTY_STRING = ""; thread_local kaguya::State *g_luaState = nullptr; +thread_local OsmLuaProcessing* osmLuaProcessing = nullptr; + +// A key in `currentTags`. If Lua code refers to an absent key, +// found will be false. +struct KnownTagKey { + bool found; + uint32_t index; +}; + +template<> struct kaguya::lua_type_traits { + typedef KnownTagKey get_type; + typedef const KnownTagKey& push_type; + + static bool strictCheckType(lua_State* l, int index) + { + return lua_type(l, index) == LUA_TSTRING; + } + static bool checkType(lua_State* l, int index) + { + return lua_isstring(l, index) != 0; + } + static get_type get(lua_State* l, int index) + { + KnownTagKey rv = { false, 0 }; + size_t size = 0; + const char* buffer = lua_tolstring(l, index, &size); + + int64_t tagLoc = osmLuaProcessing->currentTags->getKey(buffer, size); + + if (tagLoc >= 0) { + rv.found = true; + rv.index = tagLoc; + } +// std::string key(buffer, size); +// std::cout << "for key " << key << ": rv.found=" << rv.found << ", rv.index=" << rv.index << std::endl; + return rv; + } + static int push(lua_State* l, push_type s) + { + throw std::runtime_error("Lua code doesn't know how to use KnownTagKey"); + } +}; + +template<> struct kaguya::lua_type_traits { + typedef PossiblyKnownTagValue get_type; + typedef const PossiblyKnownTagValue& push_type; + + static bool strictCheckType(lua_State* l, int index) + { + return lua_type(l, index) == LUA_TSTRING; + } + static bool checkType(lua_State* l, int index) + { + return lua_isstring(l, index) != 0; + } + static get_type get(lua_State* l, int index) + { + PossiblyKnownTagValue rv = { false, 0 }; + size_t size = 0; + const char* buffer = lua_tolstring(l, index, &size); + + // For long strings where we might need to do a malloc, see if we + // can instead pass a pointer to a value from this object's tag + // map. + // + // 15 is the threshold where gcc no longer applies the small string + // optimization. 
+ if (size > 15) { + int64_t tagLoc = osmLuaProcessing->currentTags->getValue(buffer, size); + + if (tagLoc >= 0) { + rv.found = true; + rv.index = tagLoc; + return rv; + } + } + + rv.fallback = std::string(buffer, size); + return rv; + } + static int push(lua_State* l, push_type s) + { + throw std::runtime_error("Lua code doesn't know how to use PossiblyKnownTagValue"); + } +}; + +std::string rawId() { return osmLuaProcessing->Id(); } +bool rawHolds(const KnownTagKey& key) { return key.found; } +const std::string rawFind(const KnownTagKey& key) { + if (key.found) { + auto value = *(osmLuaProcessing->currentTags->getValueFromKey(key.index)); + return std::string(value.data(), value.size()); + } + + return EMPTY_STRING; +} +std::vector rawFindIntersecting(const std::string &layerName) { return osmLuaProcessing->FindIntersecting(layerName); } +bool rawIntersects(const std::string& layerName) { return osmLuaProcessing->Intersects(layerName); } +std::vector rawFindCovering(const std::string& layerName) { return osmLuaProcessing->FindCovering(layerName); } +bool rawCoveredBy(const std::string& layerName) { return osmLuaProcessing->CoveredBy(layerName); } +bool rawIsClosed() { return osmLuaProcessing->IsClosed(); } +double rawArea() { return osmLuaProcessing->Area(); } +double rawLength() { return osmLuaProcessing->Length(); } +std::vector Centroid() { return osmLuaProcessing->Centroid(); } +void rawLayer(const std::string& layerName, bool area) { return osmLuaProcessing->Layer(layerName, area); } +void rawLayerAsCentroid(const std::string &layerName) { return osmLuaProcessing->LayerAsCentroid(layerName); } +void rawMinZoom(const double z) { return osmLuaProcessing->MinZoom(z); } +void rawZOrder(const double z) { return osmLuaProcessing->ZOrder(z); } +kaguya::optional rawNextRelation() { return osmLuaProcessing->NextRelation(); } +void rawRestartRelations() { return osmLuaProcessing->RestartRelations(); } +std::string rawFindInRelation(const std::string& key) { return 
osmLuaProcessing->FindInRelation(key); } +void rawAccept() { return osmLuaProcessing->Accept(); } +double rawAreaIntersecting(const std::string& layerName) { return osmLuaProcessing->AreaIntersecting(layerName); } +std::vector rawCentroid() { return osmLuaProcessing->Centroid(); } + + bool supportsRemappingShapefiles = false; -const std::string EMPTY_STRING = ""; int lua_error_handler(int errCode, const char *errMessage) { @@ -45,31 +162,41 @@ OsmLuaProcessing::OsmLuaProcessing( g_luaState = &luaState; luaState.setErrorHandler(lua_error_handler); luaState.dofile(luaFile.c_str()); - luaState["OSM"].setClass(kaguya::UserdataMetatable() - .addFunction("Id", &OsmLuaProcessing::Id) - .addFunction("Holds", &OsmLuaProcessing::Holds) - .addFunction("Find", &OsmLuaProcessing::Find) - .addFunction("FindIntersecting", &OsmLuaProcessing::FindIntersecting) - .addFunction("Intersects", &OsmLuaProcessing::Intersects) - .addFunction("FindCovering", &OsmLuaProcessing::FindCovering) - .addFunction("CoveredBy", &OsmLuaProcessing::CoveredBy) - .addFunction("IsClosed", &OsmLuaProcessing::IsClosed) - .addFunction("Area", &OsmLuaProcessing::Area) - .addFunction("AreaIntersecting", &OsmLuaProcessing::AreaIntersecting) - .addFunction("Length", &OsmLuaProcessing::Length) - .addFunction("Centroid", &OsmLuaProcessing::Centroid) - .addFunction("Layer", &OsmLuaProcessing::Layer) - .addFunction("LayerAsCentroid", &OsmLuaProcessing::LayerAsCentroid) - .addOverloadedFunctions("Attribute", &OsmLuaProcessing::Attribute, &OsmLuaProcessing::AttributeWithMinZoom) - .addOverloadedFunctions("AttributeNumeric", &OsmLuaProcessing::AttributeNumeric, &OsmLuaProcessing::AttributeNumericWithMinZoom) - .addOverloadedFunctions("AttributeBoolean", &OsmLuaProcessing::AttributeBoolean, &OsmLuaProcessing::AttributeBooleanWithMinZoom) - .addFunction("MinZoom", &OsmLuaProcessing::MinZoom) - .addFunction("ZOrder", &OsmLuaProcessing::ZOrder) - .addFunction("Accept", &OsmLuaProcessing::Accept) - 
.addFunction("NextRelation", &OsmLuaProcessing::NextRelation) - .addFunction("RestartRelations", &OsmLuaProcessing::RestartRelations) - .addFunction("FindInRelation", &OsmLuaProcessing::FindInRelation) + + osmLuaProcessing = this; + luaState["Id"] = &rawId; + luaState["Holds"] = &rawHolds; + luaState["Find"] = &rawFind; + luaState["FindIntersecting"] = &rawFindIntersecting; + luaState["Intersects"] = &rawIntersects; + luaState["FindCovering"] = &rawFindCovering; + luaState["CoveredBy"] = &rawCoveredBy; + luaState["IsClosed"] = &rawIsClosed; + luaState["Area"] = &rawArea; + luaState["AreaIntersecting"] = &rawAreaIntersecting; + luaState["Length"] = &rawLength; + luaState["Centroid"] = &rawCentroid; + luaState["Layer"] = &rawLayer; + luaState["LayerAsCentroid"] = &rawLayerAsCentroid; + luaState["Attribute"] = kaguya::overload( + [](const std::string &key, const PossiblyKnownTagValue& val) { osmLuaProcessing->AttributeWithMinZoom(key, val, 0); }, + [](const std::string &key, const PossiblyKnownTagValue& val, const char minzoom) { osmLuaProcessing->AttributeWithMinZoom(key, val, minzoom); } + ); + luaState["AttributeNumeric"] = kaguya::overload( + [](const std::string &key, const float val) { osmLuaProcessing->AttributeNumericWithMinZoom(key, val, 0); }, + [](const std::string &key, const float val, const char minzoom) { osmLuaProcessing->AttributeNumericWithMinZoom(key, val, minzoom); } ); + luaState["AttributeBoolean"] = kaguya::overload( + [](const std::string &key, const bool val) { osmLuaProcessing->AttributeBooleanWithMinZoom(key, val, 0); }, + [](const std::string &key, const bool val, const char minzoom) { osmLuaProcessing->AttributeBooleanWithMinZoom(key, val, minzoom); } + ); + + luaState["MinZoom"] = &rawMinZoom; + luaState["ZOrder"] = &rawZOrder; + luaState["Accept"] = &rawAccept; + luaState["NextRelation"] = &rawNextRelation; + luaState["RestartRelations"] = &rawRestartRelations; + luaState["FindInRelation"] = &rawFindInRelation; 
supportsRemappingShapefiles = !!luaState["attribute_function"]; supportsReadingRelations = !!luaState["relation_scan_function"]; supportsWritingRelations = !!luaState["relation_function"]; @@ -121,18 +248,6 @@ string OsmLuaProcessing::Id() const { return to_string(originalOsmID); } -// Check if there's a value for a given key -bool OsmLuaProcessing::Holds(const string& key) const { - return currentTags->find(key) != currentTags->end(); -} - -// Get an OSM tag for a given key (or return empty string if none) -const string& OsmLuaProcessing::Find(const string& key) const { - auto it = currentTags->find(key); - if(it == currentTags->end()) return EMPTY_STRING; - return it->second; -} - // ---- Spatial queries called from Lua vector OsmLuaProcessing::FindIntersecting(const string &layerName) { @@ -327,6 +442,7 @@ const MultiPolygon &OsmLuaProcessing::multiPolygonCached() { // Add object to specified layer from Lua void OsmLuaProcessing::Layer(const string &layerName, bool area) { + outputKeys.clear(); if (layers.layerMap.count(layerName) == 0) { throw out_of_range("ERROR: Layer(): a layer named as \"" + layerName + "\" doesn't exist."); } @@ -350,7 +466,9 @@ void OsmLuaProcessing::Layer(const string &layerName, bool area) { if(CorrectGeometry(p) == CorrectGeometryResult::Invalid) return; - NodeID id = osmMemTiles.storePoint(p); + NodeID id = USE_NODE_STORE | originalOsmID; + if (materializeGeometries) + id = osmMemTiles.storePoint(p); OutputObject oo(geomType, layers.layerMap[layerName], id, 0, layerMinZoom); outputs.push_back(std::make_pair(std::move(oo), attributes)); return; @@ -441,6 +559,7 @@ void OsmLuaProcessing::Layer(const string &layerName, bool area) { } void OsmLuaProcessing::LayerAsCentroid(const string &layerName) { + outputKeys.clear(); if (layers.layerMap.count(layerName) == 0) { throw out_of_range("ERROR: LayerAsCentroid(): a layer named as \"" + layerName + "\" doesn't exist."); } @@ -466,7 +585,21 @@ void OsmLuaProcessing::LayerAsCentroid(const 
string &layerName) { return; } - NodeID id = osmMemTiles.storePoint(geomp); + NodeID id = 0; + // We don't do lazy centroids for relations - calculating their centroid + // can be quite expensive, and there's not as many of them as there are + // ways. + if (materializeGeometries || isRelation) { + id = osmMemTiles.storePoint(geomp); + } else if (!isRelation && !isWay) { + // Sometimes people call LayerAsCentroid(...) on a node, because they're + // writing a generic handler that doesn't know if it's a node or a way, + // e.g. POIs. + id = USE_NODE_STORE | originalOsmID; + } else { + id = USE_WAY_STORE | originalOsmID; + wayEmitted = true; + } OutputObject oo(POINT_, layers.layerMap[layerName], id, 0, layerMinZoom); outputs.push_back(std::make_pair(std::move(oo), attributes)); } @@ -475,8 +608,7 @@ Point OsmLuaProcessing::calculateCentroid() { Point centroid; if (isRelation) { Geometry tmp; - tmp = osmStore.wayListMultiPolygon( - outerWayVecPtr->cbegin(), outerWayVecPtr->cend(), innerWayVecPtr->begin(), innerWayVecPtr->cend()); + tmp = multiPolygonCached(); geom::centroid(tmp, centroid); return Point(centroid.x()*10000000.0, centroid.y()*10000000.0); } else if (isWay) { @@ -499,25 +631,47 @@ void OsmLuaProcessing::Accept() { relationAccepted = true; } +void OsmLuaProcessing::removeAttributeIfNeeded(const string& key) { + // Does it exist? 
+ for (int i = 0; i < outputKeys.size(); i++) { + if (outputKeys[i] == key) { + AttributeSet& set = outputs.back().second; + set.removePairWithKey(attributeStore.pairStore, attributeStore.keyStore.key2index(key)); + return; + } + } + + outputKeys.push_back(key); +} + // Set attributes in a vector tile's Attributes table -void OsmLuaProcessing::Attribute(const string &key, const string &val) { AttributeWithMinZoom(key,val,0); } -void OsmLuaProcessing::AttributeWithMinZoom(const string &key, const string &val, const char minzoom) { - if (val.size()==0) { return; } // don't set empty strings +void OsmLuaProcessing::AttributeWithMinZoom(const string &key, const PossiblyKnownTagValue& val, const char minzoom) { + std::string str; + + if (val.found) { + auto existingValue = currentTags->getValue(val.index); + str = std::string(existingValue->data(), existingValue->size()); + } else { + str = val.fallback; + } + + if (str.size()==0) { return; } // don't set empty strings if (outputs.size()==0) { ProcessingError("Can't add Attribute if no Layer set"); return; } - attributeStore.addAttribute(outputs.back().second, key, val, minzoom); + removeAttributeIfNeeded(key); + attributeStore.addAttribute(outputs.back().second, key, str, minzoom); setVectorLayerMetadata(outputs.back().first.layer, key, 0); } -void OsmLuaProcessing::AttributeNumeric(const string &key, const float val) { AttributeNumericWithMinZoom(key,val,0); } void OsmLuaProcessing::AttributeNumericWithMinZoom(const string &key, const float val, const char minzoom) { if (outputs.size()==0) { ProcessingError("Can't add Attribute if no Layer set"); return; } + removeAttributeIfNeeded(key); attributeStore.addAttribute(outputs.back().second, key, val, minzoom); setVectorLayerMetadata(outputs.back().first.layer, key, 1); } -void OsmLuaProcessing::AttributeBoolean(const string &key, const bool val) { AttributeBooleanWithMinZoom(key,val,0); } void OsmLuaProcessing::AttributeBooleanWithMinZoom(const string &key, const bool 
val, const char minzoom) { if (outputs.size()==0) { ProcessingError("Can't add Attribute if no Layer set"); return; } + removeAttributeIfNeeded(key); attributeStore.addAttribute(outputs.back().second, key, val, minzoom); setVectorLayerMetadata(outputs.back().first.layer, key, 2); } @@ -556,25 +710,27 @@ void OsmLuaProcessing::setVectorLayerMetadata(const uint_least8_t layer, const s // Scan relation (but don't write geometry) // return true if we want it, false if we don't -bool OsmLuaProcessing::scanRelation(WayID id, const tag_map_t &tags) { +bool OsmLuaProcessing::scanRelation(WayID id, const TagMap& tags) { reset(); originalOsmID = id; isWay = false; isRelation = true; currentTags = &tags; try { - luaState["relation_scan_function"](this); + luaState["relation_scan_function"](); } catch(luaProcessingException &e) { std::cerr << "Lua error on scanning relation " << originalOsmID << std::endl; exit(1); } if (!relationAccepted) return false; - osmStore.store_relation_tags(id, tags); + // If we're persisting, we need to make a real map that owns its + // own keys and values. 
+ osmStore.store_relation_tags(id, tags.exportToBoostMap()); return true; } -void OsmLuaProcessing::setNode(NodeID id, LatpLon node, const tag_map_t &tags) { +void OsmLuaProcessing::setNode(NodeID id, LatpLon node, const TagMap& tags) { reset(); originalOsmID = id; @@ -586,7 +742,7 @@ void OsmLuaProcessing::setNode(NodeID id, LatpLon node, const tag_map_t &tags) { //Start Lua processing for node try { - luaState["node_function"](this); + luaState["node_function"](); } catch(luaProcessingException &e) { std::cerr << "Lua error on node " << originalOsmID << std::endl; exit(1); @@ -602,7 +758,7 @@ void OsmLuaProcessing::setNode(NodeID id, LatpLon node, const tag_map_t &tags) { } // We are now processing a way -bool OsmLuaProcessing::setWay(WayID wayId, LatpLonVec const &llVec, const tag_map_t &tags) { +bool OsmLuaProcessing::setWay(WayID wayId, LatpLonVec const &llVec, const TagMap& tags) { reset(); wayEmitted = false; originalOsmID = wayId; @@ -630,17 +786,14 @@ bool OsmLuaProcessing::setWay(WayID wayId, LatpLonVec const &llVec, const tag_ma currentTags = &tags; - bool ok = true; - if (ok) { - //Start Lua processing for way - try { - kaguya::LuaFunction way_function = luaState["way_function"]; - kaguya::LuaRef ret = way_function(this); - assert(!ret); - } catch(luaProcessingException &e) { - std::cerr << "Lua error on way " << originalOsmID << std::endl; - exit(1); - } + //Start Lua processing for way + try { + kaguya::LuaFunction way_function = luaState["way_function"]; + kaguya::LuaRef ret = way_function(); + assert(!ret); + } catch(luaProcessingException &e) { + std::cerr << "Lua error on way " << originalOsmID << std::endl; + exit(1); } if (!this->empty()) { @@ -652,7 +805,7 @@ bool OsmLuaProcessing::setWay(WayID wayId, LatpLonVec const &llVec, const tag_ma } // We are now processing a relation -void OsmLuaProcessing::setRelation(int64_t relationId, WayVec const &outerWayVec, WayVec const &innerWayVec, const tag_map_t &tags, +void 
OsmLuaProcessing::setRelation(int64_t relationId, WayVec const &outerWayVec, WayVec const &innerWayVec, const TagMap& tags, bool isNativeMP, // only OSM type=multipolygon bool isInnerOuter) { // any OSM relation with "inner" and "outer" roles (e.g. type=multipolygon|boundary) reset(); @@ -669,7 +822,7 @@ void OsmLuaProcessing::setRelation(int64_t relationId, WayVec const &outerWayVec // Start Lua processing for relation if (!isNativeMP && !supportsWritingRelations) return; try { - luaState[isNativeMP ? "way_function" : "relation_function"](this); + luaState[isNativeMP ? "way_function" : "relation_function"](); } catch(luaProcessingException &e) { std::cerr << "Lua error on relation " << originalOsmID << std::endl; exit(1); diff --git a/src/osm_mem_tiles.cpp b/src/osm_mem_tiles.cpp index f5527d0e..7dc03f45 100644 --- a/src/osm_mem_tiles.cpp +++ b/src/osm_mem_tiles.cpp @@ -18,6 +18,30 @@ OsmMemTiles::OsmMemTiles( { } +LatpLon OsmMemTiles::buildNodeGeometry( + NodeID const objectID, + const TileBbox &bbox +) const { + if (objectID < OSM_THRESHOLD) { + return TileDataSource::buildNodeGeometry(objectID, bbox); + } + + if (IS_NODE(objectID)) + return nodeStore.at(OSM_ID(objectID)); + + + if (IS_WAY(objectID)) { + Linestring& ls = getOrBuildLinestring(objectID); + Point centroid; + Polygon p; + geom::assign_points(p, ls); + geom::centroid(p, centroid); + return LatpLon{(int32_t)(centroid.y()*10000000.0), (int32_t)(centroid.x()*10000000.0)}; + } + + throw std::runtime_error("OsmMemTiles::buildNodeGeometry: unsupported objectID"); +} + Geometry OsmMemTiles::buildWayGeometry( const OutputGeometryType geomType, const NodeID objectID, @@ -58,7 +82,7 @@ Geometry OsmMemTiles::buildWayGeometry( throw std::runtime_error("buildWayGeometry: unexpected objectID: " + std::to_string(objectID)); } -void OsmMemTiles::populateLinestring(Linestring& ls, NodeID objectID) { +void OsmMemTiles::populateLinestring(Linestring& ls, NodeID objectID) const { std::vector nodes = 
wayStore.at(OSM_ID(objectID)); for (const LatpLon& node : nodes) { @@ -66,7 +90,7 @@ void OsmMemTiles::populateLinestring(Linestring& ls, NodeID objectID) { } } -Linestring& OsmMemTiles::getOrBuildLinestring(NodeID objectID) { +Linestring& OsmMemTiles::getOrBuildLinestring(NodeID objectID) const { // Note: this function returns a reference, not a shared_ptr. // // This is safe, because this function is the only thing that can diff --git a/src/output_object.cpp b/src/output_object.cpp index b68fb27f..7f9f0edb 100644 --- a/src/output_object.cpp +++ b/src/output_object.cpp @@ -87,9 +87,12 @@ void OutputObject::writeAttributes( int OutputObject::findValue(const vector* valueList, const AttributePair& value) const { for (size_t i=0; isize(); i++) { const vector_tile::Tile_Value& v = valueList->at(i); - if (v.has_string_value() && value.hasStringValue() && v.string_value()==value.stringValue()) { return i; } - if (v.has_float_value() && value.hasFloatValue() && v.float_value() ==value.floatValue() ) { return i; } - if (v.has_bool_value() && value.hasBoolValue() && v.bool_value() ==value.boolValue() ) { return i; } + if (v.has_string_value() && value.hasStringValue()) { + const size_t valueSize = value.pooledString().size(); + if (valueSize == v.string_value().size() && memcmp(v.string_value().data(), value.pooledString().data(), valueSize) == 0) + return i; + } else if (v.has_float_value() && value.hasFloatValue() && v.float_value() ==value.floatValue() ) { return i; } + else if (v.has_bool_value() && value.hasBoolValue() && v.bool_value() ==value.boolValue() ) { return i; } } return -1; } diff --git a/src/pbf_blocks.cpp b/src/pbf_blocks.cpp deleted file mode 100644 index e33ffca0..00000000 --- a/src/pbf_blocks.cpp +++ /dev/null @@ -1,121 +0,0 @@ -#include "pbf_blocks.h" -#include "helpers.h" -#include -using namespace std; - -/* ------------------- - Protobuf handling - ------------------- */ - -// Read and parse a protobuf message -void 
readMessage(google::protobuf::Message *message, istream &input, unsigned int size) { - vector buffer(size); - input.read(&buffer.front(), size); - message->ParseFromArray(&buffer.front(), size); -} - -// Read an osm.pbf sequence of header length -> BlobHeader -> Blob -// and parse the unzipped contents into a message -BlobHeader readHeader(istream &input) { - BlobHeader bh; - - unsigned int size; - input.read((char*)&size, sizeof(size)); - if (input.eof()) { return bh; } - endian_swap(size); - - // get BlobHeader and parse - readMessage(&bh, input, size); - return bh; -} - -void readBlock(google::protobuf::Message *messagePtr, std::size_t datasize, istream &input) { - if (input.eof()) { return ; } - - // get Blob and parse - Blob blob; - readMessage(&blob, input, datasize); - - // Unzip the gzipped content - string contents = decompress_string(blob.zlib_data(), false); - messagePtr->ParseFromString(contents); -} - -void writeBlock(google::protobuf::Message *messagePtr, ostream &output, string headerType) { - // encode the message - string serialised; - messagePtr->SerializeToString(&serialised); - // create a blob and store it - Blob blob; - blob.set_raw_size(serialised.length()); - blob.set_zlib_data(compress_string(serialised)); - // encode the blob - string blob_encoded; - blob.SerializeToString(&blob_encoded); - - // create the BlobHeader - BlobHeader bh; - bh.set_type(headerType); - bh.set_datasize(blob_encoded.length()); - // encode it - string header_encoded; - bh.SerializeToString(&header_encoded); - - // write out - unsigned int bhLength=header_encoded.length(); - endian_swap(bhLength); - output.write(reinterpret_cast(&bhLength), 4); - output.write(header_encoded.c_str(), header_encoded.length() ); - output.write(blob_encoded.c_str(), blob_encoded.length() ); -} - -/* ------------------- - Tag handling - ------------------- */ - -// Populate an array with the contents of a StringTable -void readStringTable(vector *strPtr, PrimitiveBlock *pbPtr) { - 
strPtr->resize(pbPtr->stringtable().s_size()); - for (int i=0; istringtable().s_size(); i++) { - (*strPtr)[i] = pbPtr->stringtable().s(i); // dereference strPtr to get strings - } -} - -// Populate a map with the reverse contents of a StringTable (i.e. string->num) -void readStringMap(map *mapPtr, PrimitiveBlock *pbPtr) { - for (int i=0; istringtable().s_size(); i++) { - mapPtr->insert(pair (pbPtr->stringtable().s(i), i)); - } -} - -// Read the tags for a way into a hash -// requires strings array to have been populated by readStringTable -map getTags(vector *strPtr, Way *wayPtr) { - map tags; - for (int n=0; nkeys_size(); n++) { - tags[(*strPtr)[wayPtr->keys(n)]] = (*strPtr)[wayPtr->vals(n)]; - } - return tags; -} - -// Find the index of a string in the StringTable, adding it if it's not there -unsigned int findStringInTable(string *strPtr, map *mapPtr, PrimitiveBlock *pbPtr) { - if (mapPtr->find(*strPtr) == mapPtr->end()) { - pbPtr->mutable_stringtable()->add_s(*strPtr); - unsigned int ix = pbPtr->stringtable().s_size()-1; - mapPtr->insert(pair (*strPtr, ix)); - } - return mapPtr->at(*strPtr); -} - -// Set a tag for a way to a new value -void setTag(Way *wayPtr, unsigned int keyIndex, unsigned int valueIndex) { - for (int i=0; ikeys_size(); i++) { - if (wayPtr->keys(i)==keyIndex) { - wayPtr->mutable_vals()->Set(i,valueIndex); - return; - } - } - wayPtr->mutable_keys()->Add(keyIndex); - wayPtr->mutable_vals()->Add(valueIndex); -} diff --git a/src/pbf_processor.cpp b/src/pbf_processor.cpp new file mode 100644 index 00000000..78ddfaaf --- /dev/null +++ b/src/pbf_processor.cpp @@ -0,0 +1,663 @@ +#include +#include "tag_map.h" +#include "pbf_processor.h" +#include "pbf_reader.h" + +#include +#include +#include + +#include "node_store.h" +#include "way_store.h" +#include "osm_lua_processing.h" +#include "mmap_allocator.h" + +using namespace std; + +const std::string OptionSortTypeThenID = "Sort.Type_then_ID"; +const std::string OptionLocationsOnWays = 
"LocationsOnWays"; +std::atomic blocksProcessed(0), blocksToProcess(0); + +// Thread-local so that we can re-use buffers during parsing. +thread_local PbfReader::PbfReader reader; + +PbfProcessor::PbfProcessor(OSMStore &osmStore) + : osmStore(osmStore) +{ } + +bool PbfProcessor::ReadNodes(OsmLuaProcessing& output, PbfReader::PrimitiveGroup& pg, const PbfReader::PrimitiveBlock& pb, const unordered_set& nodeKeyPositions) +{ + // ---- Read nodes + TagMap tags; + std::vector nodes; + + for (auto& node : pg.nodes()) { + NodeID nodeId = node.id; + LatpLon latplon = { int(lat2latp(double(node.lat)/10000000.0)*10000000.0), node.lon }; + + bool significant = false; + for (int i = node.tagStart; i < node.tagEnd; i += 2) { + auto keyIndex = pg.translateNodeKeyValue(i); + + if (nodeKeyPositions.find(keyIndex) != nodeKeyPositions.end()) { + significant = true; + } + } + + nodes.push_back(std::make_pair(static_cast(nodeId), latplon)); + + if (significant) { + // For tagged nodes, call Lua, then save the OutputObject + tags.reset(); + + for (int n = node.tagStart; n < node.tagEnd; n += 2) { + auto keyIndex = pg.translateNodeKeyValue(n); + auto valueIndex = pg.translateNodeKeyValue(n + 1); + + const protozero::data_view& key = pb.stringTable[keyIndex]; + const protozero::data_view& value = pb.stringTable[valueIndex]; + tags.addTag(key, value); + } + output.setNode(static_cast(nodeId), latplon, tags); + } + } + + if (nodes.size() > 0) { + osmStore.nodes.insert(nodes); + } + + return !pg.nodes().empty(); +} + +bool PbfProcessor::ReadWays( + OsmLuaProcessing &output, + PbfReader::PrimitiveGroup& pg, + const PbfReader::PrimitiveBlock& pb, + bool locationsOnWays, + uint shard, + uint effectiveShards +) { + // ---- Read ways + if (pg.ways().empty()) + return false; + + TagMap tags; + + const bool wayStoreRequiresNodes = osmStore.ways.requiresNodes(); + + std::vector llWays; + std::vector>> nodeWays; + LatpLonVec llVec; + std::vector nodeVec; + + for (PbfReader::Way pbfWay : pg.ways()) { 
+ llVec.clear(); + nodeVec.clear(); + + WayID wayId = static_cast(pbfWay.id); + if (wayId >= pow(2,42)) throw std::runtime_error("Way ID negative or too large: "+std::to_string(wayId)); + + // Assemble nodelist + if (locationsOnWays) { + llVec.reserve(pbfWay.lats.size()); + for (int k=0; k 1 && !osmStore.nodes.contains(shard, nodeId)) { + skipToNext = true; + break; + } + + try { + llVec.push_back(osmStore.nodes.at(static_cast(nodeId))); + nodeVec.push_back(nodeId); + } catch (std::out_of_range &err) { + if (osmStore.integrity_enforced()) throw err; + } + } + + if (skipToNext) + continue; + } + if (llVec.empty()) continue; + + try { + tags.reset(); + readTags(pbfWay, pb, tags); + bool emitted = output.setWay(static_cast(pbfWay.id), llVec, tags); + + // If we need it for later, store the way's coordinates in the global way store + if (emitted || osmStore.way_is_used(wayId)) { + if (wayStoreRequiresNodes) + nodeWays.push_back(std::make_pair(wayId, nodeVec)); + else + llWays.push_back(std::make_pair(wayId, WayStore::latplon_vector_t(llVec.begin(), llVec.end()))); + } + + } catch (std::out_of_range &err) { + // Way is missing a node? 
+ cerr << endl << err.what() << endl; + } + + } + + if (wayStoreRequiresNodes) { + osmStore.ways.shard(shard).insertNodes(nodeWays); + } else { + osmStore.ways.shard(shard).insertLatpLons(llWays); + } + + return true; +} + +bool PbfProcessor::ScanRelations(OsmLuaProcessing& output, PbfReader::PrimitiveGroup& pg, const PbfReader::PrimitiveBlock& pb) { + // Scan relations to see which ways we need to save + if (pg.relations().empty()) + return false; + + int typeKey = findStringPosition(pb, "type"); + int mpKey = findStringPosition(pb, "multipolygon"); + + TagMap tags; + for (PbfReader::Relation pbfRelation : pg.relations()) { + bool isMultiPolygon = relationIsType(pbfRelation, typeKey, mpKey); + bool isAccepted = false; + WayID relid = static_cast(pbfRelation.id); + if (!isMultiPolygon) { + if (output.canReadRelations()) { + tags.reset(); + readTags(pbfRelation, pb, tags); + isAccepted = output.scanRelation(relid, tags); + } + if (!isAccepted) continue; + } + for (int n=0; n < pbfRelation.memids.size(); n++) { + uint64_t lastID = pbfRelation.memids[n]; + if (pbfRelation.types[n] != PbfReader::Relation::MemberType::WAY) { continue; } + if (lastID >= pow(2,42)) throw std::runtime_error("Way ID in relation "+std::to_string(relid)+" negative or too large: "+std::to_string(lastID)); + osmStore.mark_way_used(static_cast(lastID)); + if (isAccepted) { osmStore.relation_contains_way(relid, lastID); } + } + } + return true; +} + +bool PbfProcessor::ReadRelations( + OsmLuaProcessing& output, + PbfReader::PrimitiveGroup& pg, + const PbfReader::PrimitiveBlock& pb, + const BlockMetadata& blockMetadata, + uint shard, + uint effectiveShards +) { + // ---- Read relations + if (pg.relations().empty()) + return false; + + TagMap tags; + + std::vector relations; + + int typeKey = findStringPosition(pb, "type"); + int mpKey = findStringPosition(pb, "multipolygon"); + int boundaryKey = findStringPosition(pb, "boundary"); + int innerKey= findStringPosition(pb, "inner"); + int outerKey= 
findStringPosition(pb, "outer"); + if (typeKey >-1 && mpKey>-1) { + int j = -1; + for (PbfReader::Relation pbfRelation : pg.relations()) { + j++; + if (j % blockMetadata.chunks != blockMetadata.chunk) + continue; + + bool isMultiPolygon = relationIsType(pbfRelation, typeKey, mpKey); + bool isBoundary = relationIsType(pbfRelation, typeKey, boundaryKey); + if (!isMultiPolygon && !isBoundary && !output.canWriteRelations()) continue; + + // Read relation members + WayVec outerWayVec, innerWayVec; + bool isInnerOuter = isBoundary || isMultiPolygon; + bool skipToNext = false; + bool firstWay = true; + for (int n = 0; n < pbfRelation.memids.size(); n++) { + uint64_t lastID = pbfRelation.memids[n]; + if (pbfRelation.types[n] != PbfReader::Relation::MemberType::WAY) { continue; } + int32_t role = pbfRelation.roles_sid[n]; + if (role==innerKey || role==outerKey) isInnerOuter=true; + WayID wayId = static_cast(lastID); + + if (firstWay && effectiveShards > 1 && !osmStore.ways.contains(shard, wayId)) { + skipToNext = true; + break; + } + if (firstWay) + firstWay = false; + (role == innerKey ? innerWayVec : outerWayVec).push_back(wayId); + } + + if (skipToNext) + continue; + + try { + tags.reset(); + readTags(pbfRelation, pb, tags); + output.setRelation(pbfRelation.id, outerWayVec, innerWayVec, tags, isMultiPolygon, isInnerOuter); + + } catch (std::out_of_range &err) { + // Relation is missing a member? + cerr << endl << err.what() << endl; + } + } + } + + osmStore.relations_insert_front(relations); + return true; +} + +// Returns true when block was completely handled, thus could be omited by another phases. 
+bool PbfProcessor::ReadBlock( + std::istream& infile, + OsmLuaProcessing& output, + const BlockMetadata& blockMetadata, + const unordered_set& nodeKeys, + bool locationsOnWays, + ReadPhase phase, + uint shard, + uint effectiveShards +) +{ + infile.seekg(blockMetadata.offset); + + protozero::data_view blob = reader.readBlob(blockMetadata.length, infile); + PbfReader::PrimitiveBlock& pb = reader.readPrimitiveBlock(blob); + if (infile.eof()) { + return true; + } + + // Keep count of groups read during this phase. + std::size_t read_groups = 0; + + // Read the string table, and pre-calculate the positions of valid node keys + unordered_set nodeKeyPositions; + for (auto it : nodeKeys) { + //nodeKeyPositions.insert(findStringPosition(pb, it)); + auto rv = findStringPosition(pb, it); + nodeKeyPositions.insert(rv); + } + + int primitiveGroupSize = 0; + for (auto& pg : pb.groups()) { + primitiveGroupSize++; + + auto output_progress = [&]() + { + if (ioMutex.try_lock()) { + std::ostringstream str; + str << "\r"; + void_mmap_allocator::reportStoreSize(str); + if (effectiveShards > 1) + str << std::to_string(shard + 1) << "/" << std::to_string(effectiveShards) << " "; + + // TODO: revive showing the # of ways/relations? 
+ str << "Block " << blocksProcessed.load() << "/" << blocksToProcess.load() << " "; + std::cout << str.str(); + std::cout.flush(); + ioMutex.unlock(); + } + }; + + if(phase == ReadPhase::Nodes) { + bool done = ReadNodes(output, pg, pb, nodeKeyPositions); + if(done) { + output_progress(); + ++read_groups; + continue; + } + } + + if(phase == ReadPhase::RelationScan) { + osmStore.ensureUsedWaysInited(); + bool done = ScanRelations(output, pg, pb); + if(done) { + if (ioMutex.try_lock()) { + std::cout << "\r(Scanning for ways used in relations: " << (100*blocksProcessed.load()/blocksToProcess.load()) << "%) "; + std::cout.flush(); + ioMutex.unlock(); + } + continue; + } + } + + if(phase == ReadPhase::Ways) { + bool done = ReadWays(output, pg, pb, locationsOnWays, shard, effectiveShards); + if(done) { + output_progress(); + ++read_groups; + continue; + } + } + + if(phase == ReadPhase::Relations) { + bool done = ReadRelations(output, pg, pb, blockMetadata, shard, effectiveShards); + if(done) { + output_progress(); + ++read_groups; + continue; + } + } + } + + // Possible cases of a block contents: + // - single group + // - multiple groups of the same type + // - multiple groups of the different type + // + // In later case block would not be handled during this phase, and should be + // read again in remaining phases. Thus we return false to indicate that the + // block was not handled completelly. + if(read_groups != primitiveGroupSize) { + return false; + } + + // We can only delete blocks if we're confident we've processed everything, + // which is not possible in the case of subdivided blocks. 
+ return (shard + 1 == effectiveShards) && blockMetadata.chunks == 1; +} + +bool blockHasPrimitiveGroupSatisfying( + std::istream& infile, + const BlockMetadata block, + std::function test +) { + // We may have previously read to EOF, so clear the internal error state + infile.clear(); + infile.seekg(block.offset); + protozero::data_view blob = reader.readBlob(block.length, infile); + PbfReader::PrimitiveBlock pb = reader.readPrimitiveBlock(blob); + + if (infile.eof()) { + throw std::runtime_error("blockHasPrimitiveGroupSatisfying got unexpected eof"); + } + + for (auto& pg : pb.groups()) { + if (test(pg)) + return false; + } + + return true; +} + +int PbfProcessor::ReadPbfFile( + uint shards, + bool hasSortTypeThenID, + unordered_set const& nodeKeys, + unsigned int threadNum, + const pbfreader_generate_stream& generate_stream, + const pbfreader_generate_output& generate_output, + const NodeStore& nodeStore, + const WayStore& wayStore +) +{ + auto infile = generate_stream(); + + // ---- Read PBF + osmStore.clear(); + + PbfReader::HeaderBlock block = reader.readHeaderFromFile(*infile); + bool locationsOnWays = block.optionalFeatures.find(OptionLocationsOnWays) != block.optionalFeatures.end(); + if (locationsOnWays) { + std::cout << ".osm.pbf file has locations on ways" << std::endl; + } + + std::map blocks; + + // Track the filesize - note that we can't rely on tellg(), as + // its meant to be an opaque token useful only for seeking. + size_t filesize = 0; + while (true) { + PbfReader::BlobHeader bh = reader.readBlobHeader(*infile); + filesize += bh.datasize; + if (infile->eof()) { + break; + } + + blocks[blocks.size()] = { (long int)infile->tellg(), bh.datasize, true, true, true, 0, 1 }; + infile->seekg(bh.datasize, std::ios_base::cur); + } + + if (hasSortTypeThenID) { + // The PBF's blocks are sorted by type, then ID. 
We can do a binary search + // to learn where the blocks transition between object types, which + // enables a more efficient partitioning of work for reading. + std::vector indexes; + for (int i = 0; i < blocks.size(); i++) + indexes.push_back(i); + + const auto& waysStart = std::lower_bound( + indexes.begin(), + indexes.end(), + 0, + [&blocks, &infile](const auto &i, const auto &ignored) { + return blockHasPrimitiveGroupSatisfying( + *infile, + blocks[i], + [](const PbfReader::PrimitiveGroup& pg) { + for(auto w : pg.ways()) return true; + for(auto r : pg.relations()) return true; + return false; + } + ); + } + ); + + const auto& relationsStart = std::lower_bound( + indexes.begin(), + indexes.end(), + 0, + [&blocks, &infile](const auto &i, const auto &ignored) { + return blockHasPrimitiveGroupSatisfying( + *infile, + blocks[i], + [](const PbfReader::PrimitiveGroup& pg) { + for (auto r : pg.relations()) return true; + return false; + } + ); + } + ); + + for (auto it = indexes.begin(); it != indexes.end(); it++) { + blocks[*it].hasNodes = it <= waysStart; + blocks[*it].hasWays = it >= waysStart && it <= relationsStart; + blocks[*it].hasRelations = it >= relationsStart; + } + } + + + // PBFs generated by Osmium have 8,000 entities per block, + // and each block is about 64KB. + // + // PBFs generated by osmconvert (e.g., BBBike PBFs) have as + // many entities as fit in 31MB. Each block is about 16MB. + // + // Osmium PBFs seem to be processed about 3x faster than osmconvert + // PBFs, so try to hint to the user when they could speed up their + // pipeline. 
+ if (filesize / blocks.size() > 1000000) { + std::cout << "warning: PBF has very large blocks, which may slow processing" << std::endl; + std::cout << " to fix: osmium cat -f pbf your-file.osm.pbf -o optimized.osm.pbf" << std::endl; + } + + + std::vector all_phases = { ReadPhase::Nodes, ReadPhase::RelationScan, ReadPhase::Ways, ReadPhase::Relations }; + for(auto phase: all_phases) { + uint effectiveShards = 1; + + // On memory-constrained machines, we might read ways/relations + // multiple times in order to keep the working set of nodes limited. + if (phase == ReadPhase::Ways || phase == ReadPhase::Relations) + effectiveShards = shards; + + for (int shard = 0; shard < effectiveShards; shard++) { + // If we're in ReadPhase::Ways, only do a pass if there is at least one + // entry in the pass's shard. + if (phase == ReadPhase::Ways && nodeStore.shard(shard).size() == 0) + continue; + + // Ditto, but for relations + if (phase == ReadPhase::Relations && wayStore.shard(shard).size() == 0) + continue; + +#ifdef CLOCK_MONOTONIC + timespec start, end; + clock_gettime(CLOCK_MONOTONIC, &start); +#endif + + // Launch the pool with threadNum threads + boost::asio::thread_pool pool(threadNum); + std::mutex block_mutex; + + // If we're in ReadPhase::Relations and there aren't many blocks left + // to read, increase parallelism by letting each thread only process + // a portion of the block. 
+ if (phase == ReadPhase::Relations && blocks.size() < threadNum * 2) { + std::cout << "only " << blocks.size() << " relation blocks; subdividing for better parallelism" << std::endl; + std::map moreBlocks; + for (const auto& block : blocks) { + BlockMetadata newBlock = block.second; + newBlock.chunks = threadNum; + for (size_t i = 0; i < threadNum; i++) { + newBlock.chunk = i; + moreBlocks[moreBlocks.size()] = newBlock; + } + } + blocks = moreBlocks; + } + + std::deque> blockRanges; + std::map filteredBlocks; + for (const auto& entry : blocks) { + if ((phase == ReadPhase::Nodes && entry.second.hasNodes) || + (phase == ReadPhase::RelationScan && entry.second.hasRelations) || + (phase == ReadPhase::Ways && entry.second.hasWays) || + (phase == ReadPhase::Relations && entry.second.hasRelations)) + filteredBlocks[entry.first] = entry.second; + } + + blocksToProcess = filteredBlocks.size(); + blocksProcessed = 0; + + // Relations have very non-uniform processing times, so prefer + // to process them as granularly as possible. + size_t batchSize = 1; + + // When creating NodeStore/WayStore, we try to give each worker + // large batches of contiguous blocks, so that they might benefit from + // long runs of sorted indexes, and locality of nearby IDs. 
+ if (phase == ReadPhase::Nodes || phase == ReadPhase::Ways) + batchSize = (filteredBlocks.size() / (threadNum * 8)) + 1; + + size_t consumed = 0; + auto it = filteredBlocks.begin(); + while(it != filteredBlocks.end()) { + std::vector blockRange; + blockRange.reserve(batchSize); + size_t max = consumed + batchSize; + for (; consumed < max && it != filteredBlocks.end(); consumed++) { + IndexedBlockMetadata ibm; + memcpy(&ibm, &it->second, sizeof(BlockMetadata)); + ibm.index = it->first; + blockRange.push_back(ibm); + it++; + } + blockRanges.push_back(blockRange); + } + + { + for(const std::vector& blockRange: blockRanges) { + boost::asio::post(pool, [=, &blockRange, &blocks, &block_mutex, &nodeKeys]() { + if (phase == ReadPhase::Nodes) + osmStore.nodes.batchStart(); + if (phase == ReadPhase::Ways) + osmStore.ways.batchStart(); + + for (const IndexedBlockMetadata& indexedBlockMetadata: blockRange) { + auto infile = generate_stream(); + auto output = generate_output(); + + if(ReadBlock(*infile, *output, indexedBlockMetadata, nodeKeys, locationsOnWays, phase, shard, effectiveShards)) { + const std::lock_guard lock(block_mutex); + blocks.erase(indexedBlockMetadata.index); + } + blocksProcessed++; + } + }); + } + } + + pool.join(); + +#ifdef CLOCK_MONOTONIC + clock_gettime(CLOCK_MONOTONIC, &end); + uint64_t elapsedNs = 1e9 * (end.tv_sec - start.tv_sec) + end.tv_nsec - start.tv_nsec; + std::cout << "(" << std::to_string((uint32_t)(elapsedNs / 1e6)) << " ms)" << std::endl; +#endif + } + + if(phase == ReadPhase::Nodes) { + osmStore.nodes.finalize(threadNum); + } + if(phase == ReadPhase::Ways) { + osmStore.ways.finalize(threadNum); + } + } + return 0; +} + +// Find a string in the dictionary +int PbfProcessor::findStringPosition(const PbfReader::PrimitiveBlock& pb, const std::string& str) { + for (int i = 0; i < pb.stringTable.size(); i++) { + if(str.size() == pb.stringTable[i].size() && memcmp(str.data(), pb.stringTable[i].data(), str.size()) == 0) + return i; + } + return 
-1; +} + + +// ************************************************* + +int ReadPbfBoundingBox(const std::string &inputFile, double &minLon, double &maxLon, + double &minLat, double &maxLat, bool &hasClippingBox) +{ + fstream infile(inputFile, ios::in | ios::binary); + if (!infile) { cerr << "Couldn't open .pbf file " << inputFile << endl; return -1; } + auto header = reader.readHeaderFromFile(infile); + if (header.hasBbox) { + hasClippingBox = true; + minLon = header.bbox.minLon; + maxLon = header.bbox.maxLon; + minLat = header.bbox.minLat; + maxLat = header.bbox.maxLat; + } + infile.close(); + return 0; +} + +bool PbfHasOptionalFeature(const std::string& inputFile, const std::string& feature) { + std::ifstream infile(inputFile, std::ifstream::in); + auto header = reader.readHeaderFromFile(infile); + infile.close(); + return header.optionalFeatures.find(feature) != header.optionalFeatures.end(); +} diff --git a/src/pbf_reader.cpp b/src/pbf_reader.cpp new file mode 100644 index 00000000..ed400a49 --- /dev/null +++ b/src/pbf_reader.cpp @@ -0,0 +1,590 @@ +#include +#include +#include +#include "pbf_reader.h" +#include "helpers.h" + +// Where pbf_processor.cpp has higher-level routines that populate our structures, +// pbf_reader.cpp has low-level tools that interact with the protobuf. +// +// The lifetime of an object is only until someone calls a readXyz function at +// the same or higher level. +// - e.g. readPrimitiveGroup invalidates the result of a prior readPrimitiveGroup call, +// but not the result of a prior readBlob call +// +// This allows us to re-use buffers to minimize heap churn and allocation cost. +// +// If you want to persist the data beyond that, you must make a copy in memory +// that you own. 
+ +PbfReader::BlobHeader PbfReader::PbfReader::readBlobHeader(std::istream& input) { + // See https://wiki.openstreetmap.org/wiki/PBF_Format#File_format + unsigned int size; + input.read((char*)&size, sizeof(size)); + if (input.eof()) { + return {"eof", -1}; + } + + endian_swap(size); + std::vector data; + data.resize(size); + input.read(&data[0], size); + + if (input.eof()) + throw std::runtime_error("readBlobHeader: unexpected eof"); + + protozero::pbf_message message{&data[0], data.size()}; + + std::string type; + int32_t datasize = -1; + + while (message.next()) { + switch (message.tag()) { + case Schema::BlobHeader::required_string_type: + type = message.get_string(); + break; + case Schema::BlobHeader::required_int32_datasize: + datasize = message.get_int32(); + break; + default: + // ignore data for unknown tags to allow for future extensions + // std::cout << "BlobHeader: unknown tag: " << std::to_string(static_cast(message.tag())) << std::endl; + message.skip(); + } + } + + if (type.empty()) + throw std::runtime_error("BlobHeader type is missing"); + + if (datasize == -1) + throw std::runtime_error("BlobHeader datasize is missing"); + + return { type, datasize }; +} + +protozero::data_view PbfReader::PbfReader::readBlob(int32_t datasize, std::istream& input) { + blobStorage.resize(datasize); + input.read(&blobStorage[0], datasize); + if (input.eof()) + throw std::runtime_error("readBlob: unexpected eof"); + + int32_t rawSize = -1; + protozero::data_view view; + protozero::pbf_message message{&blobStorage[0], blobStorage.size()}; + while (message.next()) { + switch (message.tag()) { + case Schema::Blob::optional_int32_raw_size: + rawSize = message.get_int32(); + break; + case Schema::Blob::oneof_data_bytes_raw: + view = message.get_view(); + break; + case Schema::Blob::oneof_data_bytes_zlib_data: + view = message.get_view(); + break; + default: + throw std::runtime_error("Blob: unknown tag: " + std::to_string(static_cast(message.tag()))); + } + } + + if 
(rawSize == -1) + // Data is not compressed, can return it directly. + return view; + + blobStorage2.resize(rawSize); + decompress_string(blobStorage2, view.data(), view.size(), false); + return { &blobStorage2[0], blobStorage2.size() }; +} + +PbfReader::HeaderBBox PbfReader::PbfReader::readHeaderBBox(protozero::data_view data) { + HeaderBBox box{0, 0, 0, 0}; + + protozero::pbf_message message{data}; + while (message.next()) { + switch (message.tag()) { + case Schema::HeaderBBox::required_sint64_left: + box.minLon = message.get_sint64() / 1000000000.0; + break; + case Schema::HeaderBBox::required_sint64_right: + box.maxLon = message.get_sint64() / 1000000000.0; + break; + case Schema::HeaderBBox::required_sint64_bottom: + box.minLat = message.get_sint64() / 1000000000.0; + break; + case Schema::HeaderBBox::required_sint64_top: + box.maxLat = message.get_sint64() / 1000000000.0; + break; + default: + throw std::runtime_error("HeaderBBox: unknown tag: " + std::to_string(static_cast(message.tag()))); + } + } + + return box; +} + +PbfReader::HeaderBlock PbfReader::PbfReader::readHeaderBlock(protozero::data_view data) { + HeaderBlock block{false}; + + protozero::pbf_message message{data}; + while (message.next()) { + switch (message.tag()) { + case Schema::HeaderBlock::optional_HeaderBBox_bbox: + block.hasBbox = true; + block.bbox = PbfReader::readHeaderBBox(message.get_view()); + break; + case Schema::HeaderBlock::repeated_string_optional_features: { + const auto feature = message.get_string(); + block.optionalFeatures.insert(feature); + break; + } + default: + // ignore data for unknown tags to allow for future extensions + //std::cout << "HeaderBlock: unknown tag: " << std::to_string(static_cast(message.tag())) << std::endl; + message.skip(); + } + } + + return block; +} + +void PbfReader::PbfReader::readStringTable(protozero::data_view data, std::vector& stringTable) { + protozero::pbf_message message{data}; + while (message.next()) { + switch (message.tag()) { + 
case Schema::StringTable::repeated_bytes_s: + stringTable.push_back(message.get_view()); + break; + default: + throw std::runtime_error("StringTable: unknown tag: " + std::to_string(static_cast(message.tag()))); + } + } +} + +PbfReader::PrimitiveBlock& PbfReader::PbfReader::readPrimitiveBlock(protozero::data_view data) { + pb.stringTable.clear(); + pb.internalGroups.clear(); + + protozero::pbf_message message{data}; + while (message.next()) { + switch (message.tag()) { + case Schema::PrimitiveBlock::required_StringTable_stringtable: + // Most of our use cases require the string table, so we eagerly + // initialize it. + PbfReader::readStringTable(message.get_view(), pb.stringTable); + break; + case Schema::PrimitiveBlock::repeated_PrimitiveGroup_primitivegroup: { + pb.internalGroups.push_back(PrimitiveGroup( + message.get_view(), + denseNodes, + way, + relation + )); + break; + } + default: + // ignore data for unknown tags to allow for future extensions + //std::cout << "HeaderBlock: unknown tag: " << std::to_string(static_cast(message.tag())) << std::endl; + message.skip(); + } + } + + pb.groupsImpl = PrimitiveBlock::PrimitiveGroups(pb.internalGroups); + + return pb; +} + +void PbfReader::DenseNodes::readDenseNodes(protozero::data_view data) { + protozero::pbf_message message{data}; + + uint64_t id = 0; + int32_t lon = 0, lat = 0; + + while (message.next()) { + switch (message.tag()) { + case Schema::DenseNodes::repeated_sint64_id: { + auto pi = message.get_packed_sint64(); + for (auto i : pi) { + id += i; + ids.push_back(id); + } + break; + } case Schema::DenseNodes::repeated_sint64_lat: { + auto pi = message.get_packed_sint64(); + for (auto i : pi) { + lat += i; + lats.push_back(lat); + } + break; + } + case Schema::DenseNodes::repeated_sint64_lon: { + auto pi = message.get_packed_sint64(); + for (auto i : pi) { + lon += i; + lons.push_back(lon); + } + break; + } + case Schema::DenseNodes::repeated_int32_keys_vals: { + auto pi = message.get_packed_int32(); + 
for (auto kv : pi) { + keyValues.push_back(kv); + } + break; + } + + default: + // ignore data for unknown tags to allow for future extensions + //std::cout << "HeaderBlock: unknown tag: " << std::to_string(static_cast(message.tag())) << std::endl; + message.skip(); + } + } + + for (uint32_t cur = 0, prev = 0; cur < keyValues.size(); cur++) { + if (keyValues[cur] == 0) { + tagStart.push_back(prev); + tagEnd.push_back(cur); + prev = cur + 1; + } + } + + while(tagStart.size() < ids.size()) { + tagStart.push_back(0); + tagEnd.push_back(0); + } +} + +PbfReader::PrimitiveGroup::PrimitiveGroup( + protozero::data_view data, + DenseNodes& denseNodes, + Way& way, + Relation& relation +): + data(data), + denseNodes(denseNodes), + internalWays({this, way}), + internalRelations({this, relation}), + denseNodesInitialized(false) { +} + +int32_t PbfReader::PrimitiveGroup::translateNodeKeyValue(int32_t i) const { + return denseNodes.keyValues.at(i); +} + +protozero::data_view PbfReader::PrimitiveGroup::getDataView() { + return data; +} + +void PbfReader::PrimitiveGroup::ensureData() { + // Reset our thread locals. 
+ denseNodes.clear(); + internalWays.pg = this; + internalRelations.pg = this; + + protozero::pbf_message message{data}; + if (message.next()) { + switch (message.tag()) { + case Schema::PrimitiveGroup::repeated_Node_nodes: + throw std::runtime_error("PrimitiveGroup: non-dense Nodes are not supported"); + break; + case Schema::PrimitiveGroup::optional_DenseNodes_dense: + internalType = PrimitiveGroupType::DenseNodes; + denseNodes.readDenseNodes(message.get_view()); + break; + case Schema::PrimitiveGroup::repeated_Way_ways: + internalType = PrimitiveGroupType::Way; + break; + case Schema::PrimitiveGroup::repeated_Relation_relations: + internalType = PrimitiveGroupType::Relation; + break; + case Schema::PrimitiveGroup::repeated_ChangeSet_changesets: + internalType = PrimitiveGroupType::ChangeSet; + break; + default: + throw std::runtime_error("PrimitiveGroup: unknown tag: " + std::to_string(static_cast(message.tag()))); + } + } +} + +PbfReader::DenseNodes& PbfReader::PrimitiveGroup::nodes() const { return denseNodes; }; +PbfReader::PrimitiveBlock::PrimitiveGroups& PbfReader::PrimitiveBlock::groups() { return groupsImpl; }; + +void PbfReader::DenseNodes::clear() { + ids.clear(); + lons.clear(); + lats.clear(); + tagStart.clear(); + tagEnd.clear(); + keyValues.clear(); +} + +bool PbfReader::DenseNodes::Iterator::operator!=(Iterator& other) const { + return offset != other.offset; +} + +void PbfReader::DenseNodes::Iterator::operator++() { + offset++; + + if (offset < nodes.ids.size()) { + node.id = nodes.ids[offset]; + node.lon = nodes.lons[offset]; + node.lat = nodes.lats[offset]; + node.tagStart = nodes.tagStart[offset]; + node.tagEnd = nodes.tagEnd[offset]; + } +} + +PbfReader::DenseNodes::Node& PbfReader::DenseNodes::Iterator::operator*() { + return node; +} + +bool PbfReader::DenseNodes::empty() { + return ids.empty(); +} + +PbfReader::DenseNodes::Iterator PbfReader::DenseNodes::begin() { + auto it = Iterator {-1, Node{}, *this}; + ++it; + return it; +} + 
+PbfReader::DenseNodes::Iterator PbfReader::DenseNodes::end() { + return Iterator {static_cast(ids.size()), Node{}, *this}; +} + +bool PbfReader::PrimitiveBlock::PrimitiveGroups::Iterator::operator!=(Iterator& other) const { + return offset != other.offset; +} +void PbfReader::PrimitiveBlock::PrimitiveGroups::Iterator::operator++() { + offset++; + + if (offset < groups->size()) { + (*groups)[offset].ensureData(); + } +} +PbfReader::PrimitiveGroup& PbfReader::PrimitiveBlock::PrimitiveGroups::Iterator::operator*() { + return (*groups)[offset]; +} +PbfReader::PrimitiveBlock::PrimitiveGroups::Iterator PbfReader::PrimitiveBlock::PrimitiveGroups::begin() { + auto it = PrimitiveBlock::PrimitiveGroups::Iterator {-1, *groups }; + ++it; + return it; +} +PbfReader::PrimitiveBlock::PrimitiveGroups::Iterator PbfReader::PrimitiveBlock::PrimitiveGroups::end() { + return PrimitiveBlock::PrimitiveGroups::Iterator {static_cast(groups->size()), *groups }; +} + +PbfReader::PrimitiveGroupType PbfReader::PrimitiveGroup::type() const { + return internalType; +} + +void PbfReader::Ways::Iterator::readWay(protozero::data_view data) { + protozero::pbf_message message{data}; + + way.id = 0; + way.keys.clear(); + way.vals.clear(); + way.refs.clear(); + way.lats.clear(); + way.lons.clear(); + + uint64_t ref = 0; + uint32_t lat = 0, lon = 0; + + while (message.next()) { + switch (message.tag()) { + case Schema::Way::required_int64_id: + way.id = message.get_int64(); + break; + case Schema::Way::repeated_uint32_keys: { + auto pi = message.get_packed_uint32(); + for (auto i : pi) { + way.keys.push_back(i); + } + break; + } + case Schema::Way::repeated_uint32_vals: { + auto pi = message.get_packed_uint32(); + for (auto i : pi) { + way.vals.push_back(i); + } + break; + } + case Schema::Way::repeated_sint64_refs: { + auto pi = message.get_packed_sint64(); + for (auto i : pi) { + ref += i; + way.refs.push_back(ref); + } + break; + } + case Schema::Way::repeated_sint64_lats: { + auto pi = 
message.get_packed_sint64(); + for (auto i : pi) { + lat += i; + way.lats.push_back(lat); + } + break; + } + case Schema::Way::repeated_sint64_lons: { + auto pi = message.get_packed_sint64(); + for (auto i : pi) { + lon += i; + way.lons.push_back(lon); + } + break; + } + + default: + // ignore data for unknown tags to allow for future extensions + //std::cout << "Way: unknown tag: " << std::to_string(static_cast(message.tag())) << std::endl; + message.skip(); + } + } +} + +PbfReader::Ways& PbfReader::PrimitiveGroup::ways() const { + return internalWays; +} +bool PbfReader::Ways::Iterator::operator!=(Ways::Iterator& other) const { + return offset != other.offset; +} +void PbfReader::Ways::Iterator::operator++() { + if (message.next()) { + readWay(message.get_view()); + offset++; + } else { + offset = -1; + } +} +PbfReader::Way& PbfReader::Ways::Iterator::operator*() { + return way; +} +bool PbfReader::Ways::empty() { + return pg->type() != PrimitiveGroupType::Way; +} +PbfReader::Ways::Iterator PbfReader::Ways::begin() { + if (pg->type() != PrimitiveGroupType::Way) + return Ways::Iterator{protozero::pbf_message{nullptr, 0ul}, -1, way}; + + protozero::pbf_message message{pg->getDataView()}; + if (message.next()) { + protozero::pbf_message message{pg->getDataView()}; + auto it = Ways::Iterator{message, -1, way}; + ++it; + return it; + } + + return Ways::Iterator{message, -1, way}; +} +PbfReader::Ways::Iterator PbfReader::Ways::end() { + return Ways::Iterator{protozero::pbf_message{nullptr, 0ul}, -1, way}; +} + +void PbfReader::Relations::Iterator::readRelation(protozero::data_view data) { + protozero::pbf_message message{data}; + + relation.id = 0; + relation.keys.clear(); + relation.vals.clear(); + relation.memids.clear(); + relation.roles_sid.clear(); + relation.types.clear(); + + uint64_t memid = 0; + + while (message.next()) { + switch (message.tag()) { + case Schema::Relation::required_int64_id: + relation.id = message.get_int64(); + break; + case 
Schema::Relation::repeated_uint32_keys: { + auto pi = message.get_packed_uint32(); + for (auto i : pi) { + relation.keys.push_back(i); + } + break; + } + case Schema::Relation::repeated_uint32_vals: { + auto pi = message.get_packed_uint32(); + for (auto i : pi) { + relation.vals.push_back(i); + } + break; + } + case Schema::Relation::repeated_int32_roles_sid: { + auto pi = message.get_packed_int32(); + for (auto i : pi) { + relation.roles_sid.push_back(i); + } + break; + } + case Schema::Relation::repeated_sint64_memids: { + auto pi = message.get_packed_sint64(); + for (auto i : pi) { + memid += i; + relation.memids.push_back(memid); + } + break; + } + case Schema::Relation::repeated_MemberType_types: { + auto pi = message.get_packed_int32(); + for (auto i : pi) { + relation.types.push_back(i); + } + break; + } + + default: + // ignore data for unknown tags to allow for future extensions + //std::cout << "Way: unknown tag: " << std::to_string(static_cast(message.tag())) << std::endl; + message.skip(); + } + } +} + +PbfReader::Relations& PbfReader::PrimitiveGroup::relations() const { + return internalRelations; +} +bool PbfReader::Relations::Iterator::operator!=(Relations::Iterator& other) const { + return offset != other.offset; +} +void PbfReader::Relations::Iterator::operator++() { + if (message.next()) { + readRelation(message.get_view()); + offset++; + } else { + offset = -1; + } +} +PbfReader::Relation& PbfReader::Relations::Iterator::operator*() { + return relation; +} +bool PbfReader::Relations::empty() { + return pg->type() != PrimitiveGroupType::Relation; +} +PbfReader::Relations::Iterator PbfReader::Relations::begin() { + if (pg->type() != PrimitiveGroupType::Relation) + return Relations::Iterator{protozero::pbf_message{nullptr, 0ul}, -1, relation}; + + protozero::pbf_message message{pg->getDataView()}; + if (message.next()) { + protozero::pbf_message message{pg->getDataView()}; + auto it = Relations::Iterator{message, -1, relation}; + ++it; + return it; 
+ } + + return Relations::Iterator{message, -1, relation}; +} +PbfReader::Relations::Iterator PbfReader::Relations::end() { + return Relations::Iterator{protozero::pbf_message{nullptr, 0ul}, -1, relation}; +} + +PbfReader::HeaderBlock PbfReader::PbfReader::readHeaderFromFile(std::istream& input) { + BlobHeader bh = readBlobHeader(input); + protozero::data_view blob = readBlob(bh.datasize, input); + HeaderBlock header = readHeaderBlock(blob); + + return header; +} + diff --git a/src/pooled_string.cpp b/src/pooled_string.cpp new file mode 100644 index 00000000..500408d4 --- /dev/null +++ b/src/pooled_string.cpp @@ -0,0 +1,170 @@ +#include "pooled_string.h" +#include +#include + +namespace PooledStringNS { + std::vector tables; + std::mutex mutex; + + const uint8_t ShortString = 0b00; + const uint8_t HeapString = 0b10; + const uint8_t StdString = 0b11; + + // Each thread has its own string table, we only take a lock + // to push a new table onto the vector. + thread_local int64_t tableIndex = -1; + thread_local int64_t spaceLeft = -1; +} + +PooledString::PooledString(const std::string& str) { + if (str.size() >= 65536) + throw std::runtime_error("cannot store string longer than 64K"); + + if (str.size() <= 15) { + storage[0] = str.size(); + memcpy(storage + 1, str.data(), str.size()); + memset(storage + 1 + str.size(), 0, 16 - 1 - str.size()); + } else { + memset(storage + 8, 0, 8); + storage[0] = 1 << 7; + + if (spaceLeft < 0 || spaceLeft < str.size()) { + std::lock_guard lock(mutex); + spaceLeft = 65536; + char* buffer = (char*)malloc(spaceLeft); + if (buffer == 0) + throw std::runtime_error("PooledString could not malloc"); + tables.push_back(buffer); + tableIndex = tables.size() - 1; + } + + storage[1] = tableIndex >> 16; + storage[2] = tableIndex >> 8; + storage[3] = tableIndex; + + uint16_t offset = 65536 - spaceLeft; + storage[4] = offset >> 8; + storage[5] = offset; + + uint16_t length = str.size(); + storage[6] = length >> 8; + storage[7] = length; + + 
memcpy(tables[tableIndex] + offset, str.data(), str.size()); + + spaceLeft -= str.size(); + } +} + +PooledString::PooledString(const std::string* str) { + storage[0] = StdString << 6; + + *(const std::string**)((void*)(storage + 8)) = str; +} + +bool PooledStringNS::PooledString::operator==(const PooledString& other) const { + // NOTE: We have surprising equality semantics! + // + // If one of the strings is a StdString, it's value equality. + // + // Else, for short strings, you are equal if the strings are equal. + // + // For large strings, you are equal if you use the same heap memory locations. + // This implies that someone outside of PooledString is managing pooling! In our + // case, it is the responsibility of AttributePairStore. + uint8_t kind = storage[0] >> 6; + uint8_t otherKind = other.storage[0] >> 6; + + if (kind == StdString || otherKind == StdString) { + size_t mySize = size(); + if (mySize != other.size()) + return false; + + return memcmp(data(), other.data(), mySize) == 0; + } + + return memcmp(storage, other.storage, 16) == 0; +} + +bool PooledStringNS::PooledString::operator!=(const PooledString& other) const { + return !(*this == other); +} + +const char* PooledStringNS::PooledString::data() const { + uint8_t kind = storage[0] >> 6; + + if (kind == ShortString) + return (char *)(storage + 1); + + if (kind == StdString) { + const std::string* str = *(const std::string**)((void*)(storage + 8)); + return str->data(); + } + + uint32_t tableIndex = (storage[1] << 16) + (storage[2] << 8) + storage[3]; + uint16_t offset = (storage[4] << 8) + storage[5]; + + const char* data = tables[tableIndex] + offset; + return data; +} + +size_t PooledStringNS::PooledString::size() const { + uint8_t kind = storage[0] >> 6; + // If the uppermost bit is set, we're in heap. 
+ if (kind == HeapString) { + uint16_t length = (storage[6] << 8) + storage[7]; + return length; + } + + if (kind == ShortString) + // Otherwise it's stored in the lower 7 bits of the highest byte. + return storage[0] & 0b01111111; + + const std::string* str = *(const std::string**)((void*)(storage + 8)); + return str->size(); +} + +std::string PooledStringNS::PooledString::toString() const { + std::string rv; + uint8_t kind = storage[0] >> 6; + if (kind == HeapString) { + // heap + rv.reserve(size()); + + uint32_t tableIndex = (storage[1] << 16) + (storage[2] << 8) + storage[3]; + uint16_t offset = (storage[4] << 8) + storage[5]; + + char* data = tables[tableIndex] + offset; + rv.append(data, size()); + return rv; + } + + if (kind == ShortString) { + for (int i = 0; i < storage[0]; i++) + rv += storage[i + 1]; + return rv; + } + + const std::string* str = *(const std::string**)((void*)(storage + 8)); + return *str; +} + +void PooledStringNS::PooledString::ensureStringIsOwned() { + uint8_t kind = storage[0] >> 6; + + if (kind != StdString) + return; + + *this = PooledString(toString()); +} + +bool PooledStringNS::PooledString::operator<(const PooledString& other) const { + size_t mySize = size(); + size_t otherSize = other.size(); + + if (mySize != otherSize) + return mySize < otherSize; + + return memcmp(data(), other.data(), mySize) < 0; +} + diff --git a/src/read_pbf.cpp b/src/read_pbf.cpp deleted file mode 100644 index 605618fa..00000000 --- a/src/read_pbf.cpp +++ /dev/null @@ -1,592 +0,0 @@ -#include -#include "read_pbf.h" -#include "pbf_blocks.h" - -#include -#include -#include -#include - -#include "node_store.h" -#include "way_store.h" -#include "osm_lua_processing.h" -#include "mmap_allocator.h" - -using namespace std; - -const std::string OptionSortTypeThenID = "Sort.Type_then_ID"; -const std::string OptionLocationsOnWays = "LocationsOnWays"; -std::atomic blocksProcessed(0), blocksToProcess(0); - -PbfReader::PbfReader(OSMStore &osmStore) - : 
osmStore(osmStore) -{ } - -bool PbfReader::ReadNodes(OsmLuaProcessing &output, PrimitiveGroup &pg, PrimitiveBlock const &pb, const unordered_set &nodeKeyPositions) -{ - // ---- Read nodes - - if (pg.has_dense()) { - int64_t nodeId = 0; - int lon = 0; - int lat = 0; - int kvPos = 0; - DenseNodes dense = pg.dense(); - - std::vector nodes; - for (int j=0; j0) { - while (dense.keys_vals(kvPos)>0) { - if (nodeKeyPositions.find(dense.keys_vals(kvPos)) != nodeKeyPositions.end()) { - significant = true; - } - kvPos+=2; - } - kvPos++; - } - - nodes.push_back(std::make_pair(static_cast(nodeId), node)); - - if (significant) { - // For tagged nodes, call Lua, then save the OutputObject - boost::container::flat_map tags; - tags.reserve(kvPos / 2); - - for (uint n=kvStart; n(nodeId), node, tags); - } - - } - - osmStore.nodes.insert(nodes); - return true; - } - return false; -} - -bool PbfReader::ReadWays(OsmLuaProcessing &output, PrimitiveGroup &pg, PrimitiveBlock const &pb, bool locationsOnWays) { - // ---- Read ways - - if (pg.ways_size() > 0) { - Way pbfWay; - - const bool wayStoreRequiresNodes = osmStore.ways.requiresNodes(); - - std::vector llWays; - std::vector>> nodeWays; - - for (int j=0; j(pbfWay.id()); - if (wayId >= pow(2,42)) throw std::runtime_error("Way ID negative or too large: "+std::to_string(wayId)); - - // Assemble nodelist - LatpLonVec llVec; - std::vector nodeVec; - if (locationsOnWays) { - int lat=0, lon=0; - llVec.reserve(pbfWay.lats_size()); - for (int k=0; k(nodeId))); - nodeVec.push_back(nodeId); - } catch (std::out_of_range &err) { - if (osmStore.integrity_enforced()) throw err; - } - } - } - if (llVec.empty()) continue; - - try { - tag_map_t tags; - readTags(pbfWay, pb, tags); - bool emitted = output.setWay(static_cast(pbfWay.id()), llVec, tags); - - // If we need it for later, store the way's coordinates in the global way store - if (emitted || osmStore.way_is_used(wayId)) { - if (wayStoreRequiresNodes) - nodeWays.push_back(std::make_pair(wayId, 
nodeVec)); - else - llWays.push_back(std::make_pair(wayId, WayStore::latplon_vector_t(llVec.begin(), llVec.end()))); - } - - } catch (std::out_of_range &err) { - // Way is missing a node? - cerr << endl << err.what() << endl; - } - - } - - if (wayStoreRequiresNodes) { - osmStore.ways.insertNodes(nodeWays); - } else { - osmStore.ways.insertLatpLons(llWays); - } - - return true; - } - return false; -} - -bool PbfReader::ScanRelations(OsmLuaProcessing &output, PrimitiveGroup &pg, PrimitiveBlock const &pb) { - // Scan relations to see which ways we need to save - if (pg.relations_size()==0) return false; - - int typeKey = findStringPosition(pb, "type"); - int mpKey = findStringPosition(pb, "multipolygon"); - - for (int j=0; j(pbfRelation.id()); - if (!isMultiPolygon) { - if (output.canReadRelations()) { - tag_map_t tags; - readTags(pbfRelation, pb, tags); - isAccepted = output.scanRelation(relid, tags); - } - if (!isAccepted) continue; - } - int64_t lastID = 0; - for (int n=0; n < pbfRelation.memids_size(); n++) { - lastID += pbfRelation.memids(n); - if (pbfRelation.types(n) != Relation_MemberType_WAY) { continue; } - if (lastID >= pow(2,42)) throw std::runtime_error("Way ID in relation "+std::to_string(relid)+" negative or too large: "+std::to_string(lastID)); - osmStore.mark_way_used(static_cast(lastID)); - if (isAccepted) { osmStore.relation_contains_way(relid, lastID); } - } - } - return true; -} - -bool PbfReader::ReadRelations( - OsmLuaProcessing& output, - PrimitiveGroup& pg, - const PrimitiveBlock& pb, - const BlockMetadata& blockMetadata -) { - // ---- Read relations - - if (pg.relations_size() > 0) { - std::vector relations; - - int typeKey = findStringPosition(pb, "type"); - int mpKey = findStringPosition(pb, "multipolygon"); - int boundaryKey = findStringPosition(pb, "boundary"); - int innerKey= findStringPosition(pb, "inner"); - int outerKey= findStringPosition(pb, "outer"); - if (typeKey >-1 && mpKey>-1) { - for (int j=0; j(lastID); - (role == innerKey ? 
innerWayVec : outerWayVec).push_back(wayId); - } - - try { - tag_map_t tags; - readTags(pbfRelation, pb, tags); - output.setRelation(pbfRelation.id(), outerWayVec, innerWayVec, tags, isMultiPolygon, isInnerOuter); - - } catch (std::out_of_range &err) { - // Relation is missing a member? - cerr << endl << err.what() << endl; - } - } - } - - osmStore.relations_insert_front(relations); - return true; - } - return false; -} - -// Returns true when block was completely handled, thus could be omited by another phases. -bool PbfReader::ReadBlock( - std::istream& infile, - OsmLuaProcessing& output, - const BlockMetadata& blockMetadata, - const unordered_set& nodeKeys, - bool locationsOnWays, - ReadPhase phase -) -{ - infile.seekg(blockMetadata.offset); - - PrimitiveBlock pb; - readBlock(&pb, blockMetadata.length, infile); - if (infile.eof()) { - return true; - } - - // Keep count of groups read during this phase. - std::size_t read_groups = 0; - - // Read the string table, and pre-calculate the positions of valid node keys - unordered_set nodeKeyPositions; - for (auto it : nodeKeys) { - nodeKeyPositions.insert(findStringPosition(pb, it.c_str())); - } - - for (int i=0; i test -) { - PrimitiveBlock pb; - - // We may have previously read to EOF, so clear the internal error state - infile.clear(); - infile.seekg(block.offset); - readBlock(&pb, block.length, infile); - if (infile.eof()) { - throw std::runtime_error("blockHasPrimitiveGroupSatisfying got unexpected eof"); - } - - for (int i=0; i const& nodeKeys, - unsigned int threadNum, - const pbfreader_generate_stream& generate_stream, - const pbfreader_generate_output& generate_output -) -{ - auto infile = generate_stream(); - - // ---- Read PBF - osmStore.clear(); - - HeaderBlock block; - readBlock(&block, readHeader(*infile).datasize(), *infile); - bool locationsOnWays = false; - for (std::string option : block.optional_features()) { - if (option == OptionLocationsOnWays) { - std::cout << ".osm.pbf file has locations on 
ways" << std::endl; - locationsOnWays = true; - } - } - - std::map blocks; - - // Track the filesize - note that we can't rely on tellg(), as - // its meant to be an opaque token useful only for seeking. - size_t filesize = 0; - while (true) { - BlobHeader bh = readHeader(*infile); - filesize += bh.datasize(); - if (infile->eof()) { - break; - } - - blocks[blocks.size()] = { (long int)infile->tellg(), bh.datasize(), true, true, true, 0, 1 }; - infile->seekg(bh.datasize(), std::ios_base::cur); - } - - if (hasSortTypeThenID) { - // The PBF's blocks are sorted by type, then ID. We can do a binary search - // to learn where the blocks transition between object types, which - // enables a more efficient partitioning of work for reading. - std::vector indexes; - for (int i = 0; i < blocks.size(); i++) - indexes.push_back(i); - - const auto& waysStart = std::lower_bound( - indexes.begin(), - indexes.end(), - 0, - [&blocks, &infile](const auto &i, const auto &ignored) { - return blockHasPrimitiveGroupSatisfying( - *infile, - blocks[i], - [](const PrimitiveGroup&pg) { return pg.ways_size() > 0 || pg.relations_size() > 0; } - ); - } - ); - - const auto& relationsStart = std::lower_bound( - indexes.begin(), - indexes.end(), - 0, - [&blocks, &infile](const auto &i, const auto &ignored) { - return blockHasPrimitiveGroupSatisfying( - *infile, - blocks[i], - [](const PrimitiveGroup&pg) { return pg.relations_size() > 0; } - ); - } - ); - - for (auto it = indexes.begin(); it != indexes.end(); it++) { - blocks[*it].hasNodes = it <= waysStart; - blocks[*it].hasWays = it >= waysStart && it <= relationsStart; - blocks[*it].hasRelations = it >= relationsStart; - } - } - - - // PBFs generated by Osmium have 8,000 entities per block, - // and each block is about 64KB. - // - // PBFs generated by osmconvert (e.g., BBBike PBFs) have as - // many entities as fit in 31MB. Each block is about 16MB. 
- // - // Osmium PBFs seem to be processed about 3x faster than osmconvert - // PBFs, so try to hint to the user when they could speed up their - // pipeline. - if (filesize / blocks.size() > 1000000) { - std::cout << "warning: PBF has very large blocks, which may slow processing" << std::endl; - std::cout << " to fix: osmium cat -f pbf your-file.osm.pbf -o optimized.osm.pbf" << std::endl; - } - - - std::vector all_phases = { ReadPhase::Nodes, ReadPhase::RelationScan, ReadPhase::Ways, ReadPhase::Relations }; - for(auto phase: all_phases) { - // Launch the pool with threadNum threads - boost::asio::thread_pool pool(threadNum); - std::mutex block_mutex; - - // If we're in ReadPhase::Relations and there aren't many blocks left - // to read, increase parallelism by letting each thread only process - // a portion of the block. - if (phase == ReadPhase::Relations && blocks.size() < threadNum * 2) { - std::cout << "only " << blocks.size() << " relation blocks; subdividing for better parallelism" << std::endl; - std::map moreBlocks; - for (const auto& block : blocks) { - BlockMetadata newBlock = block.second; - newBlock.chunks = threadNum; - for (size_t i = 0; i < threadNum; i++) { - newBlock.chunk = i; - moreBlocks[moreBlocks.size()] = newBlock; - } - } - blocks = moreBlocks; - } - - std::deque> blockRanges; - std::map filteredBlocks; - for (const auto& entry : blocks) { - if ((phase == ReadPhase::Nodes && entry.second.hasNodes) || - (phase == ReadPhase::RelationScan && entry.second.hasRelations) || - (phase == ReadPhase::Ways && entry.second.hasWays) || - (phase == ReadPhase::Relations && entry.second.hasRelations)) - filteredBlocks[entry.first] = entry.second; - } - - blocksToProcess = filteredBlocks.size(); - blocksProcessed = 0; - - // When processing blocks, we try to give each worker large batches - // of contiguous blocks, so that they might benefit from long runs - // of sorted indexes, and locality of nearby IDs. 
- const size_t batchSize = (filteredBlocks.size() / (threadNum * 8)) + 1; - - size_t consumed = 0; - auto it = filteredBlocks.begin(); - while(it != filteredBlocks.end()) { - std::vector blockRange; - blockRange.reserve(batchSize); - size_t max = consumed + batchSize; - for (; consumed < max && it != filteredBlocks.end(); consumed++) { - IndexedBlockMetadata ibm; - memcpy(&ibm, &it->second, sizeof(BlockMetadata)); - ibm.index = it->first; - blockRange.push_back(ibm); - it++; - } - blockRanges.push_back(blockRange); - } - - { - for(const std::vector& blockRange: blockRanges) { - boost::asio::post(pool, [=, &blockRange, &blocks, &block_mutex, &nodeKeys]() { - if (phase == ReadPhase::Nodes) - osmStore.nodes.batchStart(); - if (phase == ReadPhase::Ways) - osmStore.ways.batchStart(); - - for (const IndexedBlockMetadata& indexedBlockMetadata: blockRange) { - auto infile = generate_stream(); - auto output = generate_output(); - - if(ReadBlock(*infile, *output, indexedBlockMetadata, nodeKeys, locationsOnWays, phase)) { - const std::lock_guard lock(block_mutex); - blocks.erase(indexedBlockMetadata.index); - blocksProcessed++; - } - } - }); - } - } - - pool.join(); - - if(phase == ReadPhase::Nodes) { - osmStore.nodes.finalize(threadNum); - } - if(phase == ReadPhase::Ways) { - osmStore.ways.finalize(threadNum); - } - } - return 0; -} - -// Find a string in the dictionary -int PbfReader::findStringPosition(PrimitiveBlock const &pb, char const *str) { - for (int i=0; i()> createNodeStore): + createNodeStore(createNodeStore) { + for (int i = 0; i < shards(); i++) + stores.push_back(createNodeStore()); +} + +ShardedNodeStore::~ShardedNodeStore() { +} + +void ShardedNodeStore::reopen() { + for (auto& store : stores) + store->reopen(); +} + +void ShardedNodeStore::finalize(size_t threadNum) { + for (auto& store : stores) + store->finalize(threadNum); +} + +LatpLon ShardedNodeStore::at(NodeID id) const { + for (int i = 0; i < shards(); i++) { + size_t index = (lastNodeShard + i) % 
shards(); + + if (stores[index]->contains(0, id)) { + lastNodeShard = index; + return stores[index]->at(id); + } + } + + // Superfluous return to silence a compiler warning + return stores[shards() - 1]->at(id); +} + +size_t ShardedNodeStore::size() const { + size_t rv = 0; + for (auto& store : stores) + rv += store->size(); + + return rv; +} + +void ShardedNodeStore::batchStart() { + for (auto& store : stores) + store->batchStart(); +} + +size_t pickStore(const LatpLon& el) { + // Assign the element to a shard. This is a pretty naive division + // of the globe, tuned to have max ~10GB of nodes/ways per shard. + + const size_t z5x = lon2tilex(el.lon / 10000000, 5); + const size_t z5y = latp2tiley(el.latp / 10000000, 5); + + const size_t z4x = z5x / 2; + const size_t z4y = z5y / 2; + + const size_t z3x = z4x / 2; + const size_t z3y = z4y / 2; + + if (z3x == 5 && z3y == 2) return 5; // Western Russia + if (z3x == 4 && z3y == 3) return 5; // North Africa + if (z3x == 5 && z3y == 3) return 5; // India + + if ((z5x == 16 && z5y == 10) || (z5x == 16 && z5y == 11)) return 4; // some of Central Europe + if ((z5x == 17 && z5y == 10) || (z5x == 17 && z5y == 11)) return 1; // some more of Central Europe + + if (z3x == 4 && z3y == 2) return 3; // rest of Central Europe + + const size_t z2x = z3x / 2; + const size_t z2y = z3y / 2; + + if (z2x == 3 && z2y == 1) return 3; // Asia, Russia + if (z2x == 1 && z2y == 1) return 2; // North Atlantic Ocean and bordering countries + if (z2x == 0 && z2y == 1) return 1; // North America + +// std::cout << "z2x=" << std::to_string(z2x) << ", z2y=" << std::to_string(z2y) << std::endl; + return 0; // Artic, Antartcica, Oceania, South Africa, South America +} + +void ShardedNodeStore::insert(const std::vector& elements) { + std::vector> perStore(shards()); + + for (const auto& el : elements) { + perStore[pickStore(el.second)].push_back(el); + } + + for (int i = 0; i < shards(); i++) { + if (!perStore[i].empty()) + 
stores[i]->insert(perStore[i]); + } +} + +bool ShardedNodeStore::contains(size_t shard, NodeID id) const { + return stores[shard]->contains(0, id); +} + +size_t ShardedNodeStore::shards() const { + return 6; +} diff --git a/src/sharded_way_store.cpp b/src/sharded_way_store.cpp new file mode 100644 index 00000000..d9741082 --- /dev/null +++ b/src/sharded_way_store.cpp @@ -0,0 +1,81 @@ +#include "sharded_way_store.h" +#include "node_store.h" + +thread_local size_t lastWayShard = 0; + +ShardedWayStore::ShardedWayStore(std::function()> createWayStore, const NodeStore& nodeStore): + createWayStore(createWayStore), + nodeStore(nodeStore) { + for (int i = 0; i < shards(); i++) + stores.push_back(createWayStore()); +} + +ShardedWayStore::~ShardedWayStore() { +} + +void ShardedWayStore::reopen() { + for (auto& store : stores) + store->reopen(); +} + +void ShardedWayStore::batchStart() { + for (auto& store : stores) + store->batchStart(); +} + +std::vector ShardedWayStore::at(WayID wayid) const { + for (int i = 0; i < shards(); i++) { + size_t index = (lastWayShard + i) % shards(); + if (stores[index]->contains(0, wayid)) { + lastWayShard = index; + return stores[index]->at(wayid); + } + } + + // Superfluous return to silence a compiler warning + return stores[shards() - 1]->at(wayid); +} + +bool ShardedWayStore::requiresNodes() const { + return stores[0]->requiresNodes(); +} + +void ShardedWayStore::insertLatpLons(std::vector &newWays) { + throw std::runtime_error("ShardedWayStore::insertLatpLons: don't call this directly"); +} + +void ShardedWayStore::insertNodes(const std::vector>>& newWays) { + throw std::runtime_error("ShardedWayStore::insertNodes: don't call this directly"); +} + +void ShardedWayStore::clear() { + for (auto& store : stores) + store->clear(); +} + +std::size_t ShardedWayStore::size() const { + size_t rv = 0; + for (auto& store : stores) + rv += store->size(); + return rv; +} + +void ShardedWayStore::finalize(unsigned int threadNum) { + for (auto& store 
: stores) + store->finalize(threadNum); +} + +bool ShardedWayStore::contains(size_t shard, WayID id) const { + return stores[shard]->contains(0, id); +} + +WayStore& ShardedWayStore::shard(size_t shard) { + return *stores[shard].get(); +} + +const WayStore& ShardedWayStore::shard(size_t shard) const { + return *stores[shard].get(); +} + +size_t ShardedWayStore::shards() const { return nodeStore.shards(); } + diff --git a/src/shared_data.cpp b/src/shared_data.cpp index 78cfe11d..da9787d8 100644 --- a/src/shared_data.cpp +++ b/src/shared_data.cpp @@ -10,7 +10,7 @@ using namespace rapidjson; SharedData::SharedData(Config &configIn, const class LayerDefinition &layers) : layers(layers), config(configIn) { - outputMode=OUTPUT_FILE; + outputMode=OptionsParser::OutputMode::File; mergeSqlite=false; } diff --git a/src/sorted_node_store.cpp b/src/sorted_node_store.cpp index 76aa81b8..82dccb55 100644 --- a/src/sorted_node_store.cpp +++ b/src/sorted_node_store.cpp @@ -2,7 +2,6 @@ #include #include #include -#include #include #include #include "sorted_node_store.h" @@ -16,40 +15,51 @@ namespace SortedNodeStoreTypes { const uint16_t ChunkAlignment = 16; const uint32_t ChunkCompressed = 1 << 31; - std::atomic totalGroups; - std::atomic totalNodes; - std::atomic totalGroupSpace; - std::atomic totalAllocatedSpace; - std::atomic totalChunks; - std::atomic chunkSizeFreqs[257]; - std::atomic groupSizeFreqs[257]; - - - // When SortedNodeStore first starts, it's not confident that it has seen an - // entire segment, so it's in "collecting orphans" mode. Once it crosses a - // threshold of 64K elements, it ceases to be in this mode. - // - // Orphans are rounded up across multiple threads, and dealt with in - // the finalize step. 
- thread_local bool collectingOrphans = true; - thread_local uint64_t groupStart = -1; - thread_local std::vector* localNodes = nullptr; - - thread_local int64_t cachedChunk = -1; - thread_local std::vector cacheChunkLons; - thread_local std::vector cacheChunkLatps; - - thread_local uint32_t arenaSpace = 0; - thread_local char* arenaPtr = nullptr; + struct ThreadStorage { + ThreadStorage(): + collectingOrphans(true), + groupStart(-1), + localNodes(nullptr), + cachedChunk(-1), + arenaSpace(0), + arenaPtr(nullptr) {} + // When SortedNodeStore first starts, it's not confident that it has seen an + // entire segment, so it's in "collecting orphans" mode. Once it crosses a + // threshold of 64K elements, it ceases to be in this mode. + // + // Orphans are rounded up across multiple threads, and dealt with in + // the finalize step. + bool collectingOrphans = true; + uint64_t groupStart = -1; + std::vector* localNodes = nullptr; + + int64_t cachedChunk = -1; + std::vector cacheChunkLons; + std::vector cacheChunkLatps; + + uint32_t arenaSpace = 0; + char* arenaPtr = nullptr; + }; + + thread_local std::deque> threadStorage; + + ThreadStorage& s(const SortedNodeStore* who) { + for (auto& entry : threadStorage) + if (entry.first == who) + return entry.second; + + threadStorage.push_back(std::make_pair(who, ThreadStorage())); + + auto& rv = threadStorage.back(); + return rv.second; + } } using namespace SortedNodeStoreTypes; SortedNodeStore::SortedNodeStore(bool compressNodes): compressNodes(compressNodes) { - // Each group can store 64K nodes. If we allocate 256K slots - // for groups, we support 2^34 = 17B nodes, or about twice - // the number used by OSM as of November 2023. 
- groups.resize(256 * 1024); + s(this); // allocate our ThreadStorage before multi-threading + reopen(); } void SortedNodeStore::reopen() @@ -61,11 +71,16 @@ void SortedNodeStore::reopen() totalNodes = 0; totalGroups = 0; totalGroupSpace = 0; + totalAllocatedSpace = 0; totalChunks = 0; memset(chunkSizeFreqs, 0, sizeof(chunkSizeFreqs)); memset(groupSizeFreqs, 0, sizeof(groupSizeFreqs)); orphanage.clear(); workerBuffers.clear(); + + // Each group can store 64K nodes. If we allocate 256K slots + // for groups, we support 2^34 = 17B nodes, or about twice + // the number used by OSM as of November 2023. groups.clear(); groups.resize(256 * 1024); } @@ -73,6 +88,48 @@ void SortedNodeStore::reopen() SortedNodeStore::~SortedNodeStore() { for (const auto entry: allocatedMemory) void_mmap_allocator::deallocate(entry.first, entry.second); + + s(this) = ThreadStorage(); +} + +bool SortedNodeStore::contains(size_t shard, NodeID id) const { + const size_t groupIndex = id / (GroupSize * ChunkSize); + const size_t chunk = (id % (GroupSize * ChunkSize)) / ChunkSize; + const uint64_t chunkMaskByte = chunk / 8; + const uint64_t chunkMaskBit = chunk % 8; + + const uint64_t nodeMaskByte = (id % ChunkSize) / 8; + const uint64_t nodeMaskBit = id % 8; + + GroupInfo* groupPtr = groups[groupIndex]; + + if (groupPtr == nullptr) + return false; + + size_t chunkOffset = 0; + { + chunkOffset = popcnt(groupPtr->chunkMask, chunkMaskByte); + uint8_t maskByte = groupPtr->chunkMask[chunkMaskByte]; + maskByte = maskByte & ((1 << chunkMaskBit) - 1); + chunkOffset += popcnt(&maskByte, 1); + + if (!(groupPtr->chunkMask[chunkMaskByte] & (1 << chunkMaskBit))) + return false; + } + + uint16_t scaledOffset = groupPtr->chunkOffsets[chunkOffset]; + ChunkInfoBase* basePtr = (ChunkInfoBase*)(((char *)(groupPtr->chunkOffsets + popcnt(groupPtr->chunkMask, 32))) + (scaledOffset * ChunkAlignment)); + + size_t nodeOffset = 0; + nodeOffset = popcnt(basePtr->nodeMask, nodeMaskByte); + uint8_t maskByte = 
basePtr->nodeMask[nodeMaskByte]; + maskByte = maskByte & ((1 << nodeMaskBit) - 1); + nodeOffset += popcnt(&maskByte, 1); + if (!(basePtr->nodeMask[nodeMaskByte] & (1 << nodeMaskBit))) + return false; + + + return true; } LatpLon SortedNodeStore::at(const NodeID id) const { @@ -109,29 +166,30 @@ LatpLon SortedNodeStore::at(const NodeID id) const { size_t latpSize = (ptr->flags >> 10) & ((1 << 10) - 1); // TODO: we don't actually need the lonSize to decompress the data. // May as well store it as a sanity check for now. - size_t lonSize = ptr->flags & ((1 << 10) - 1); + // size_t lonSize = ptr->flags & ((1 << 10) - 1); size_t n = popcnt(ptr->nodeMask, 32) - 1; const size_t neededChunk = groupIndex * ChunkSize + chunk; // Really naive caching strategy - just cache the last-used chunk. // Probably good enough? - if (cachedChunk != neededChunk) { - cachedChunk = neededChunk; - cacheChunkLons.reserve(256); - cacheChunkLatps.reserve(256); + ThreadStorage& tls = s(this); + if (tls.cachedChunk != neededChunk) { + tls.cachedChunk = neededChunk; + tls.cacheChunkLons.reserve(256); + tls.cacheChunkLatps.reserve(256); uint8_t* latpData = ptr->data; uint8_t* lonData = ptr->data + latpSize; uint32_t recovdata[256] = {0}; streamvbyte_decode(latpData, recovdata, n); - cacheChunkLatps[0] = ptr->firstLatp; - zigzag_delta_decode(recovdata, &cacheChunkLatps[1], n, cacheChunkLatps[0]); + tls.cacheChunkLatps[0] = ptr->firstLatp; + zigzag_delta_decode(recovdata, &tls.cacheChunkLatps[1], n, tls.cacheChunkLatps[0]); streamvbyte_decode(lonData, recovdata, n); - cacheChunkLons[0] = ptr->firstLon; - zigzag_delta_decode(recovdata, &cacheChunkLons[1], n, cacheChunkLons[0]); + tls.cacheChunkLons[0] = ptr->firstLon; + zigzag_delta_decode(recovdata, &tls.cacheChunkLons[1], n, tls.cacheChunkLons[0]); } size_t nodeOffset = 0; @@ -142,7 +200,7 @@ LatpLon SortedNodeStore::at(const NodeID id) const { if (!(ptr->nodeMask[nodeMaskByte] & (1 << nodeMaskBit))) throw std::out_of_range("SortedNodeStore: node " 
+ std::to_string(id) + " missing, no node"); - return { cacheChunkLatps[nodeOffset], cacheChunkLons[nodeOffset] }; + return { tls.cacheChunkLatps[nodeOffset], tls.cacheChunkLons[nodeOffset] }; } UncompressedChunkInfo* ptr = (UncompressedChunkInfo*)basePtr; @@ -184,58 +242,60 @@ size_t SortedNodeStore::size() const { } void SortedNodeStore::insert(const std::vector& elements) { - if (localNodes == nullptr) { + ThreadStorage& tls = s(this); + if (tls.localNodes == nullptr) { std::lock_guard lock(orphanageMutex); if (workerBuffers.size() == 0) workerBuffers.reserve(256); else if (workerBuffers.size() == workerBuffers.capacity()) throw std::runtime_error("SortedNodeStore doesn't support more than 256 cores"); workerBuffers.push_back(std::vector()); - localNodes = &workerBuffers.back(); + tls.localNodes = &workerBuffers.back(); } - if (groupStart == -1) { + if (tls.groupStart == -1) { // Mark where the first full group starts, so we know when to transition // out of collecting orphans. - groupStart = elements[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + tls.groupStart = elements[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); } int i = 0; - while (collectingOrphans && i < elements.size()) { + while (tls.collectingOrphans && i < elements.size()) { const element_t& el = elements[i]; - if (el.first >= groupStart + (GroupSize * ChunkSize)) { - collectingOrphans = false; + if (el.first >= tls.groupStart + (GroupSize * ChunkSize)) { + tls.collectingOrphans = false; // Calculate new groupStart, rounding to previous boundary. 
- groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); - collectOrphans(*localNodes); - localNodes->clear(); + tls.groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + collectOrphans(*tls.localNodes); + tls.localNodes->clear(); } - localNodes->push_back(el); + tls.localNodes->push_back(el); i++; } while(i < elements.size()) { const element_t& el = elements[i]; - if (el.first >= groupStart + (GroupSize * ChunkSize)) { - publishGroup(*localNodes); - localNodes->clear(); - groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + if (el.first >= tls.groupStart + (GroupSize * ChunkSize)) { + publishGroup(*tls.localNodes); + tls.localNodes->clear(); + tls.groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); } - localNodes->push_back(el); + tls.localNodes->push_back(el); i++; } } void SortedNodeStore::batchStart() { - collectingOrphans = true; - groupStart = -1; - if (localNodes == nullptr || localNodes->size() == 0) + ThreadStorage& tls = s(this); + tls.collectingOrphans = true; + tls.groupStart = -1; + if (tls.localNodes == nullptr || tls.localNodes->size() == 0) return; - collectOrphans(*localNodes); - localNodes->clear(); + collectOrphans(*tls.localNodes); + tls.localNodes->clear(); } void SortedNodeStore::finalize(size_t threadNum) { @@ -264,7 +324,7 @@ void SortedNodeStore::finalize(size_t threadNum) { orphanage.clear(); - std::cout << "SortedNodeStore: " << totalGroups << " groups, " << totalChunks << " chunks, " << totalNodes.load() << " nodes, " << totalGroupSpace.load() << " bytes (" << (1000ull * (totalAllocatedSpace.load() - totalGroupSpace.load()) / totalAllocatedSpace.load()) / 10.0 << "% wasted)" << std::endl; + std::cout << "SortedNodeStore: " << totalGroups << " groups, " << totalChunks << " chunks, " << totalNodes.load() << " nodes, " << totalGroupSpace.load() << " bytes (" << (1000ull * (totalAllocatedSpace.load() - totalGroupSpace.load()) / 
(totalAllocatedSpace.load() + 1)) / 10.0 << "% wasted)" << std::endl; /* for (int i = 0; i < 257; i++) std::cout << "chunkSizeFreqs[ " << i << " ]= " << chunkSizeFreqs[i].load() << std::endl; @@ -410,22 +470,23 @@ void SortedNodeStore::publishGroup(const std::vector& nodes) { GroupInfo* groupInfo = nullptr; - if (arenaSpace < groupSpace) { + ThreadStorage& tls = s(this); + if (tls.arenaSpace < groupSpace) { // A full group takes ~330KB. Nodes are read _fast_, and there ends // up being contention calling the allocator when reading the // planet on a machine with 48 cores -- so allocate in large chunks. - arenaSpace = 4 * 1024 * 1024; - totalAllocatedSpace += arenaSpace; - arenaPtr = (char*)void_mmap_allocator::allocate(arenaSpace); - if (arenaPtr == nullptr) + tls.arenaSpace = 4 * 1024 * 1024; + totalAllocatedSpace += tls.arenaSpace; + tls.arenaPtr = (char*)void_mmap_allocator::allocate(tls.arenaSpace); + if (tls.arenaPtr == nullptr) throw std::runtime_error("SortedNodeStore: failed to allocate arena"); std::lock_guard lock(orphanageMutex); - allocatedMemory.push_back(std::make_pair((void*)arenaPtr, arenaSpace)); + allocatedMemory.push_back(std::make_pair((void*)tls.arenaPtr, tls.arenaSpace)); } - arenaSpace -= groupSpace; - groupInfo = (GroupInfo*)arenaPtr; - arenaPtr += groupSpace; + tls.arenaSpace -= groupSpace; + groupInfo = (GroupInfo*)tls.arenaPtr; + tls.arenaPtr += groupSpace; if (groups[groupIndex] != nullptr) throw std::runtime_error("SortedNodeStore: group already present"); diff --git a/src/sorted_way_store.cpp b/src/sorted_way_store.cpp index 8fdaa806..302deab9 100644 --- a/src/sorted_way_store.cpp +++ b/src/sorted_way_store.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -19,40 +18,56 @@ namespace SortedWayStoreTypes { const uint16_t ClosedWay = 1 << 14; const uint16_t UniformUpperBits = 1 << 13; - thread_local bool collectingOrphans = true; - thread_local uint64_t groupStart = -1; - thread_local std::vector>>* localWays = NULL; + struct 
ThreadStorage { + ThreadStorage(): + collectingOrphans(true), + groupStart(-1), + localWays(nullptr) {} - thread_local std::vector encodedWay; + bool collectingOrphans; + uint64_t groupStart; + std::vector>>* localWays; + std::vector encodedWay; + }; + + thread_local std::deque> threadStorage; + + inline ThreadStorage& s(const SortedWayStore* who) { + for (auto& entry : threadStorage) + if (entry.first == who) + return entry.second; + + threadStorage.push_back(std::make_pair(who, ThreadStorage())); + + auto& rv = threadStorage.back(); + return rv.second; + } // C++ doesn't support variable length arrays declared on stack. // g++ and clang support it, but msvc doesn't. Rather than pay the // cost of a vector for every decode, we use a thread_local with room for at // least 2,000 nodes. + // + // Note: these are scratch buffers, so they remain as true thread-locals, + // and aren't part of ThreadStorage. thread_local uint64_t highBytes[2000]; thread_local uint32_t uint32Buffer[2000]; thread_local int32_t int32Buffer[2000]; thread_local uint8_t uint8Buffer[8192]; - - std::atomic totalWays; - std::atomic totalNodes; - std::atomic totalGroups; - std::atomic totalGroupSpace; - std::atomic totalChunks; } using namespace SortedWayStoreTypes; SortedWayStore::SortedWayStore(bool compressWays, const NodeStore& nodeStore): compressWays(compressWays), nodeStore(nodeStore) { - // Each group can store 64K ways. If we allocate 32K slots, - // we support 2^31 = 2B ways, or about twice the number used - // by OSM as of December 2023. - groups.resize(32 * 1024); + s(this); // allocate our ThreadStorage before multi-threading + reopen(); } SortedWayStore::~SortedWayStore() { for (const auto entry: allocatedMemory) void_mmap_allocator::deallocate(entry.first, entry.second); + + s(this) = ThreadStorage(); } void SortedWayStore::reopen() { @@ -67,11 +82,64 @@ void SortedWayStore::reopen() { totalChunks = 0; orphanage.clear(); workerBuffers.clear(); + + // Each group can store 64K ways. 
If we allocate 32K slots, + // we support 2^31 = 2B ways, or about twice the number used + // by OSM as of December 2023. groups.clear(); - groups.resize(256 * 1024); + groups.resize(32 * 1024); } +bool SortedWayStore::contains(size_t shard, WayID id) const { + const size_t groupIndex = id / (GroupSize * ChunkSize); + const size_t chunk = (id % (GroupSize * ChunkSize)) / ChunkSize; + const uint64_t chunkMaskByte = chunk / 8; + const uint64_t chunkMaskBit = chunk % 8; + + const uint64_t wayMaskByte = (id % ChunkSize) / 8; + const uint64_t wayMaskBit = id % 8; + + GroupInfo* groupPtr = groups[groupIndex]; + + if (groupPtr == nullptr) + return false; + + size_t chunkOffset = 0; + { + chunkOffset = popcnt(groupPtr->chunkMask, chunkMaskByte); + uint8_t maskByte = groupPtr->chunkMask[chunkMaskByte]; + maskByte = maskByte & ((1 << chunkMaskBit) - 1); + chunkOffset += popcnt(&maskByte, 1); + + if (!(groupPtr->chunkMask[chunkMaskByte] & (1 << chunkMaskBit))) + return false; + } + + ChunkInfo* chunkPtr = (ChunkInfo*)((char*)groupPtr + groupPtr->chunkOffsets[chunkOffset]); + + { + size_t wayOffset = 0; + wayOffset = popcnt(chunkPtr->smallWayMask, wayMaskByte); + uint8_t maskByte = chunkPtr->smallWayMask[wayMaskByte]; + maskByte = maskByte & ((1 << wayMaskBit) - 1); + wayOffset += popcnt(&maskByte, 1); + if (chunkPtr->smallWayMask[wayMaskByte] & (1 << wayMaskBit)) + return true; + } + + size_t wayOffset = 0; + wayOffset += popcnt(chunkPtr->smallWayMask, 32); + wayOffset += popcnt(chunkPtr->bigWayMask, wayMaskByte); + uint8_t maskByte = chunkPtr->bigWayMask[wayMaskByte]; + maskByte = maskByte & ((1 << wayMaskBit) - 1); + wayOffset += popcnt(&maskByte, 1); + if (!(chunkPtr->bigWayMask[wayMaskByte] & (1 << wayMaskBit))) + return false; + + return true; +} + std::vector SortedWayStore::at(WayID id) const { const size_t groupIndex = id / (GroupSize * ChunkSize); const size_t chunk = (id % (GroupSize * ChunkSize)) / ChunkSize; @@ -140,52 +208,53 @@ void 
SortedWayStore::insertLatpLons(std::vector &newWays throw std::runtime_error("SortedWayStore does not support insertLatpLons"); } -const void SortedWayStore::insertNodes(const std::vector>>& newWays) { - // read_pbf can call with an empty array if the only ways it read were unable to +void SortedWayStore::insertNodes(const std::vector>>& newWays) { + // pbf_processor can call with an empty array if the only ways it read were unable to // be processed due to missing nodes, so be robust against empty way vector. if (newWays.empty()) return; - if (localWays == nullptr) { + ThreadStorage& tls = s(this); + if (tls.localWays == nullptr) { std::lock_guard lock(orphanageMutex); if (workerBuffers.size() == 0) workerBuffers.reserve(256); else if (workerBuffers.size() == workerBuffers.capacity()) throw std::runtime_error("SortedWayStore doesn't support more than 256 cores"); workerBuffers.push_back(std::vector>>()); - localWays = &workerBuffers.back(); + tls.localWays = &workerBuffers.back(); } - if (groupStart == -1) { + if (tls.groupStart == -1) { // Mark where the first full group starts, so we know when to transition // out of collecting orphans. - groupStart = newWays[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + tls.groupStart = newWays[0].first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); } int i = 0; - while (collectingOrphans && i < newWays.size()) { + while (tls.collectingOrphans && i < newWays.size()) { const auto& el = newWays[i]; - if (el.first >= groupStart + (GroupSize * ChunkSize)) { - collectingOrphans = false; + if (el.first >= tls.groupStart + (GroupSize * ChunkSize)) { + tls.collectingOrphans = false; // Calculate new groupStart, rounding to previous boundary. 
- groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); - collectOrphans(*localWays); - localWays->clear(); + tls.groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + collectOrphans(*tls.localWays); + tls.localWays->clear(); } - localWays->push_back(el); + tls.localWays->push_back(el); i++; } while(i < newWays.size()) { const auto& el = newWays[i]; - if (el.first >= groupStart + (GroupSize * ChunkSize)) { - publishGroup(*localWays); - localWays->clear(); - groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); + if (el.first >= tls.groupStart + (GroupSize * ChunkSize)) { + publishGroup(*tls.localWays); + tls.localWays->clear(); + tls.groupStart = el.first / (GroupSize * ChunkSize) * (GroupSize * ChunkSize); } - localWays->push_back(el); + tls.localWays->push_back(el); i++; } } @@ -229,13 +298,14 @@ void SortedWayStore::finalize(unsigned int threadNum) { } void SortedWayStore::batchStart() { - collectingOrphans = true; - groupStart = -1; - if (localWays == nullptr || localWays->size() == 0) + ThreadStorage& tls = s(this); + tls.collectingOrphans = true; + tls.groupStart = -1; + if (tls.localWays == nullptr || tls.localWays->size() == 0) return; - collectOrphans(*localWays); - localWays->clear(); + collectOrphans(*tls.localWays); + tls.localWays->clear(); } void SortedWayStore::collectOrphans(const std::vector>>& orphans) { @@ -244,6 +314,7 @@ void SortedWayStore::collectOrphans(const std::vector>>& vec = orphanage[groupIndex]; const size_t i = vec.size(); + vec.resize(i + orphans.size()); std::copy(orphans.begin(), orphans.end(), vec.begin() + i); } @@ -284,7 +355,6 @@ std::vector SortedWayStore::decodeWay(uint16_t flags, const uint8_t* inp for (int i = 0; i < length; i++) rv.push_back(highBytes[i] | lowIntData[i]); } else { - uint16_t compressedLength = *(uint16_t*)input; input += 2; uint32_t firstInt = *(uint32_t*)(input); @@ -408,6 +478,7 @@ void populateMask(uint8_t* mask, const std::vector& ids) { 
} void SortedWayStore::publishGroup(const std::vector>>& ways) { + ThreadStorage& tls = s(this); totalWays += ways.size(); if (ways.size() == 0) { throw std::runtime_error("SortedWayStore: group is empty"); @@ -451,12 +522,12 @@ void SortedWayStore::publishGroup(const std::vectorwayIds.push_back(id % ChunkSize); - uint16_t flags = encodeWay(way.second, encodedWay, compressWays && way.second.size() >= 4); + uint16_t flags = encodeWay(way.second, tls.encodedWay, compressWays && way.second.size() >= 4); lastChunk->wayFlags.push_back(flags); std::vector encoded; - encoded.resize(encodedWay.size()); - memcpy(encoded.data(), encodedWay.data(), encodedWay.size()); + encoded.resize(tls.encodedWay.size()); + memcpy(encoded.data(), tls.encodedWay.data(), tls.encodedWay.size()); lastChunk->encodedWays.push_back(std::move(encoded)); } diff --git a/src/tag_map.cpp b/src/tag_map.cpp new file mode 100644 index 00000000..8fc02a96 --- /dev/null +++ b/src/tag_map.cpp @@ -0,0 +1,135 @@ +#include "tag_map.h" +#include +#include + +TagMap::TagMap() { + keys.resize(16); + key2value.resize(16); + values.resize(16); +} + +void TagMap::reset() { + for (int i = 0; i < 16; i++) { + keys[i].clear(); + key2value[i].clear(); + values[i].clear(); + } +} + +const std::size_t hashString(const std::string& str) { + // This is a pretty crappy hash function in terms of bit + // avalanching and distribution of output values. + // + // But it's very good in terms of speed, which turns out + // to be the important measure. + std::size_t hash = str.size(); + if (hash >= 4) + hash ^= *(uint32_t*)str.data(); + + return hash; +} + +const std::size_t hashString(const char* str, size_t size) { + // This is a pretty crappy hash function in terms of bit + // avalanching and distribution of output values. + // + // But it's very good in terms of speed, which turns out + // to be the important measure. 
+ std::size_t hash = size; + if (hash >= 4) + hash ^= *(uint32_t*)str; + + return hash; +} + +uint32_t TagMap::ensureString( + std::vector>& vector, + const protozero::data_view& value +) { + std::size_t hash = hashString(value.data(), value.size()); + + const uint16_t shard = hash % vector.size(); + for (int i = 0; i < vector[shard].size(); i++) + if (*(vector[shard][i]) == value) + return shard << 16 | i; + + vector[shard].push_back(&value); + return shard << 16 | (vector[shard].size() - 1); +} + + +void TagMap::addTag(const protozero::data_view& key, const protozero::data_view& value) { + uint32_t valueLoc = ensureString(values, value); +// std::cout << "valueLoc = " << valueLoc << std::endl; + uint32_t keyLoc = ensureString(keys, key); +// std::cout << "keyLoc = " << keyLoc << std::endl; + + + const uint16_t shard = keyLoc >> 16; + const uint16_t pos = keyLoc; +// std::cout << "shard=" << shard << ", pos=" << pos << std::endl; + if (key2value[shard].size() <= pos) { +// std::cout << "growing shard" << std::endl; + key2value[shard].resize(pos + 1); + } + + key2value[shard][pos] = valueLoc; +} + +int64_t TagMap::getKey(const char* key, size_t size) const { + // Return -1 if key not found, else return its keyLoc. + std::size_t hash = hashString(key, size); + + const uint16_t shard = hash % keys.size(); + for (int i = 0; i < keys[shard].size(); i++) { + const protozero::data_view& candidate = *keys[shard][i]; + if (candidate.size() != size) + continue; + + if (memcmp(candidate.data(), key, size) == 0) + return shard << 16 | i; + } + + return -1; +} + +int64_t TagMap::getValue(const char* value, size_t size) const { + // Return -1 if value not found, else return its valueLoc. 
+ std::size_t hash = hashString(value, size); + + const uint16_t shard = hash % values.size(); + for (int i = 0; i < values[shard].size(); i++) { + const protozero::data_view& candidate = *values[shard][i]; + if (candidate.size() != size) + continue; + + if (memcmp(candidate.data(), value, size) == 0) + return shard << 16 | i; + } + + return -1; +} + +const protozero::data_view* TagMap::getValueFromKey(uint32_t keyLoc) const { + const uint32_t valueLoc = key2value[keyLoc >> 16][keyLoc & 0xFFFF]; + return values[valueLoc >> 16][valueLoc & 0xFFFF]; +} + +const protozero::data_view* TagMap::getValue(uint32_t valueLoc) const { + return values[valueLoc >> 16][valueLoc & 0xFFFF]; +} + +boost::container::flat_map TagMap::exportToBoostMap() const { + boost::container::flat_map rv; + + for (int i = 0; i < keys.size(); i++) { + for (int j = 0; j < keys[i].size(); j++) { + uint32_t valueLoc = key2value[i][j]; + auto key = *keys[i][j]; + auto value = *values[valueLoc >> 16][valueLoc & 0xFFFF]; + rv[std::string(key.data(), key.size())] = std::string(value.data(), value.size()); + } + } + + return rv; +} diff --git a/src/tile_data.cpp b/src/tile_data.cpp index 696ed333..407f534a 100644 --- a/src/tile_data.cpp +++ b/src/tile_data.cpp @@ -47,12 +47,14 @@ TileDataSource::TileDataSource(size_t threadNum, unsigned int baseZoom, bool inc z6OffsetDivisor(baseZoom >= CLUSTER_ZOOM ? 
(1 << (baseZoom - CLUSTER_ZOOM)) : 1), objectsMutex(threadNum * 4), objects(CLUSTER_ZOOM_AREA), + lowZoomObjects(CLUSTER_ZOOM_AREA), objectsWithIds(CLUSTER_ZOOM_AREA), + lowZoomObjectsWithIds(CLUSTER_ZOOM_AREA), baseZoom(baseZoom), pointStores(threadNum), linestringStores(threadNum), - multipolygonStores(threadNum), multilinestringStores(threadNum), + multipolygonStores(threadNum), multiPolygonClipCache(ClipCache(threadNum, baseZoom)), multiLinestringClipCache(ClipCache(threadNum, baseZoom)) { @@ -71,9 +73,21 @@ TileDataSource::TileDataSource(size_t threadNum, unsigned int baseZoom, bool inc } } +thread_local std::vector>* tlsPendingSmallIndexObjects = nullptr; + void TileDataSource::finalize(size_t threadNum) { - finalizeObjects(threadNum, baseZoom, objects.begin(), objects.end()); - finalizeObjects(threadNum, baseZoom, objectsWithIds.begin(), objectsWithIds.end()); + uint64_t finalized = 0; + for (const auto& vec : pendingSmallIndexObjects) { + for (const auto& tuple : vec) { + finalized++; + addObjectToSmallIndexUnsafe(std::get<0>(tuple), std::get<1>(tuple), std::get<2>(tuple)); + } + } + + std::cout << "indexed " << finalized << " contended objects" << std::endl; + + finalizeObjects(name(), threadNum, baseZoom, objects.begin(), objects.end(), lowZoomObjects); + finalizeObjects(name(), threadNum, baseZoom, objectsWithIds.begin(), objectsWithIds.end(), lowZoomObjectsWithIds); } void TileDataSource::addObjectToSmallIndex(const TileCoordinates& index, const OutputObject& oo, uint64_t id) { @@ -87,8 +101,28 @@ void TileDataSource::addObjectToSmallIndex(const TileCoordinates& index, const O } const size_t z6index = z6x * CLUSTER_ZOOM_WIDTH + z6y; + auto& mutex = objectsMutex[z6index % objectsMutex.size()]; + + if (mutex.try_lock()) { + addObjectToSmallIndexUnsafe(index, oo, id); + mutex.unlock(); + } else { + // add to tlsPendingSmallIndexObjects + if (tlsPendingSmallIndexObjects == nullptr) { + std::lock_guard lock(objectsMutex[0]); + 
pendingSmallIndexObjects.push_back(std::vector>()); + tlsPendingSmallIndexObjects = &pendingSmallIndexObjects.back(); + } - std::lock_guard lock(objectsMutex[z6index % objectsMutex.size()]); + tlsPendingSmallIndexObjects->push_back(std::make_tuple(index, oo, id)); + } +} + +void TileDataSource::addObjectToSmallIndexUnsafe(const TileCoordinates& index, const OutputObject& oo, uint64_t id) { + // Pick the z6 index + const size_t z6x = index.x / z6OffsetDivisor; + const size_t z6y = index.y / z6OffsetDivisor; + const size_t z6index = z6x * CLUSTER_ZOOM_WIDTH + z6y; if (id == 0 || !includeID) objects[z6index].push_back({ @@ -105,32 +139,39 @@ void TileDataSource::addObjectToSmallIndex(const TileCoordinates& index, const O }); } -void TileDataSource::collectTilesWithObjectsAtZoom(uint zoom, TileCoordinatesSet& output) { +void TileDataSource::collectTilesWithObjectsAtZoom(std::vector& zooms) { // Scan through all shards. Convert to base zoom, then convert to the requested zoom. - collectTilesWithObjectsAtZoomTemplate(baseZoom, objects.begin(), objects.size(), zoom, output); - collectTilesWithObjectsAtZoomTemplate(baseZoom, objectsWithIds.begin(), objectsWithIds.size(), zoom, output); + collectTilesWithObjectsAtZoomTemplate(baseZoom, objects.begin(), objects.size(), zooms); + collectTilesWithObjectsAtZoomTemplate(baseZoom, objectsWithIds.begin(), objectsWithIds.size(), zooms); } -void addCoveredTilesToOutput(const uint baseZoom, const uint zoom, const Box& box, TileCoordinatesSet& output) { - int scale = pow(2, baseZoom-zoom); +void addCoveredTilesToOutput(const uint baseZoom, std::vector& zooms, const Box& box) { + size_t maxZoom = zooms.size() - 1; + int scale = pow(2, baseZoom - maxZoom); TileCoordinate minx = box.min_corner().x() / scale; TileCoordinate maxx = box.max_corner().x() / scale; TileCoordinate miny = box.min_corner().y() / scale; TileCoordinate maxy = box.max_corner().y() / scale; for (int x=minx; x<=maxx; x++) { for (int y=miny; y<=maxy; y++) { - 
output.set(x, y); + size_t zx = x, zy = y; + + for (int zoom = maxZoom; zoom >= 0; zoom--) { + zooms[zoom].set(zx, zy); + zx /= 2; + zy /= 2; + } } } } // Find the tiles used by the "large objects" from the rtree index -void TileDataSource::collectTilesWithLargeObjectsAtZoom(uint zoom, TileCoordinatesSet &output) { +void TileDataSource::collectTilesWithLargeObjectsAtZoom(std::vector& zooms) { for(auto const &result: boxRtree) - addCoveredTilesToOutput(baseZoom, zoom, result.first, output); + addCoveredTilesToOutput(baseZoom, zooms, result.first); for(auto const &result: boxRtreeWithIds) - addCoveredTilesToOutput(baseZoom, zoom, result.first, output); + addCoveredTilesToOutput(baseZoom, zooms, result.first); } // Copy objects from the tile at dstIndex (in the dataset srcTiles) into output @@ -139,11 +180,15 @@ void TileDataSource::collectObjectsForTile( TileCoordinates dstIndex, std::vector& output ) { + if (zoom < CLUSTER_ZOOM) { + collectLowZoomObjectsForTile(baseZoom, lowZoomObjects, zoom, dstIndex, output); + collectLowZoomObjectsForTile(baseZoom, lowZoomObjectsWithIds, zoom, dstIndex, output); + return; + } + size_t iStart = 0; size_t iEnd = objects.size(); - // TODO: we could also narrow the search space for z1..z5, too. - // They're less important, as they have fewer tiles. 
if (zoom >= CLUSTER_ZOOM) { // Compute the x, y at the base zoom level TileCoordinate z6x = dstIndex.x / (1 << (zoom - CLUSTER_ZOOM)); @@ -188,11 +233,7 @@ Geometry TileDataSource::buildWayGeometry(OutputGeometryType const geomType, NodeID const objectID, const TileBbox &bbox) { switch(geomType) { case POINT_: { - auto p = retrievePoint(objectID); - if (geom::within(p, bbox.clippingBox)) { - return p; - } - return MultiLinestring(); + throw std::runtime_error("unexpected geomType in buildWayGeometry"); } case LINESTRING_: { @@ -329,22 +370,12 @@ Geometry TileDataSource::buildWayGeometry(OutputGeometryType const geomType, } } -LatpLon TileDataSource::buildNodeGeometry(OutputGeometryType const geomType, - NodeID const objectID, const TileBbox &bbox) const { - switch(geomType) { - case POINT_: { - auto p = retrievePoint(objectID); - LatpLon out; - out.latp = p.y(); - out.lon = p.x(); - return out; - } - - default: - break; - } - - throw std::runtime_error("Geometry type is not point"); +LatpLon TileDataSource::buildNodeGeometry(NodeID const objectID, const TileBbox &bbox) const { + auto p = retrievePoint(objectID); + LatpLon out; + out.latp = p.y(); + out.lon = p.x(); + return out; } @@ -366,18 +397,14 @@ void TileDataSource::reportSize() const { std::cout << "Generated points: " << (points - 1) << ", lines: " << (linestrings - 2) << ", polygons: " << (polygons - 1) << std::endl; } -TileCoordinatesSet getTilesAtZoom( +void populateTilesAtZoom( const std::vector& sources, - unsigned int zoom + std::vector& zooms ) { - TileCoordinatesSet tileCoordinates(zoom); - for(size_t i=0; icollectTilesWithObjectsAtZoom(zoom, tileCoordinates); - sources[i]->collectTilesWithLargeObjectsAtZoom(zoom, tileCoordinates); + sources[i]->collectTilesWithObjectsAtZoom(zooms); + sources[i]->collectTilesWithLargeObjectsAtZoom(zooms); } - - return tileCoordinates; } std::vector TileDataSource::getObjectsForTile( @@ -532,7 +559,7 @@ NodeID TileDataSource::storePoint(const Point& input) { NodeID 
offset = store.second->size(); store.second->emplace_back(input); - NodeID rv = (store.first << (35 - shardBits)) + offset; + NodeID rv = (store.first << (TILE_DATA_ID_SIZE - shardBits)) + offset; return rv; } @@ -542,7 +569,7 @@ NodeID TileDataSource::storeLinestring(const Linestring& src) { NodeID offset = store.second->size(); store.second->emplace_back(std::move(dst)); - NodeID rv = (store.first << (35 - shardBits)) + offset; + NodeID rv = (store.first << (TILE_DATA_ID_SIZE - shardBits)) + offset; return rv; } @@ -564,7 +591,7 @@ NodeID TileDataSource::storeMultiPolygon(const MultiPolygon& src) { NodeID offset = store.second->size(); store.second->emplace_back(std::move(dst)); - NodeID rv = (store.first << (35 - shardBits)) + offset; + NodeID rv = (store.first << (TILE_DATA_ID_SIZE - shardBits)) + offset; return rv; } @@ -579,7 +606,7 @@ NodeID TileDataSource::storeMultiLinestring(const MultiLinestring& src) { NodeID offset = store.second->size(); store.second->emplace_back(std::move(dst)); - NodeID rv = (store.first << (35 - shardBits)) + offset; + NodeID rv = (store.first << (TILE_DATA_ID_SIZE - shardBits)) + offset; return rv; } diff --git a/src/tile_worker.cpp b/src/tile_worker.cpp index 5f5c48b2..7951fcaf 100644 --- a/src/tile_worker.cpp +++ b/src/tile_worker.cpp @@ -176,7 +176,7 @@ void ProcessObjects( if (oo.oo.geomType == POINT_) { vector_tile::Tile_Feature *featurePtr = vtLayer->add_features(); - LatpLon pos = source->buildNodeGeometry(oo.oo.geomType, oo.oo.objectID, bbox); + LatpLon pos = source->buildNodeGeometry(oo.oo.objectID, bbox); featurePtr->add_geometry(9); // moveTo, repeat x1 pair xy = bbox.scaleLatpLon(pos.latp/10000000.0, pos.lon/10000000.0); featurePtr->add_geometry((xy.first << 1) ^ (xy.first >> 31)); @@ -378,13 +378,13 @@ void outputProc( // Write to file or sqlite string outputdata, compressed; - if (sharedData.outputMode == OUTPUT_MBTILES) { + if (sharedData.outputMode == OptionsParser::OutputMode::MBTiles) { // Write to sqlite 
tile.SerializeToString(&outputdata); if (sharedData.config.compress) { compressed = compress_string(outputdata, Z_DEFAULT_COMPRESSION, sharedData.config.gzip); } sharedData.mbtiles.saveTile(zoom, bbox.index.x, bbox.index.y, sharedData.config.compress ? &compressed : &outputdata, sharedData.mergeSqlite); - } else if (sharedData.outputMode == OUTPUT_PMTILES) { + } else if (sharedData.outputMode == OptionsParser::OutputMode::PMTiles) { // Write to pmtiles tile.SerializeToString(&outputdata); sharedData.pmtiles.saveTile(zoom, bbox.index.x, bbox.index.y, outputdata); diff --git a/src/tilemaker.cpp b/src/tilemaker.cpp index 852be49b..3c3f55fe 100644 --- a/src/tilemaker.cpp +++ b/src/tilemaker.cpp @@ -48,8 +48,9 @@ #include "osm_lua_processing.h" #include "mbtiles.h" +#include "options_parser.h" #include "shared_data.h" -#include "read_pbf.h" +#include "pbf_processor.h" #include "read_shp.h" #include "tile_worker.h" #include "osm_mem_tiles.h" @@ -80,89 +81,46 @@ bool verbose = false; * * Worker threads write the output tiles, and start in the outputProc function. 
*/ -int main(int argc, char* argv[]) { - +int main(const int argc, const char* argv[]) { // ---- Read command-line options - vector inputFiles; - string luaFile; - string osmStoreFile; - string jsonFile; - uint threadNum; - string outputFile; - string bbox; - bool _verbose = false, mergeSqlite = false, mapsplit = false, osmStoreCompact = false, skipIntegrity = false, osmStoreUncompressedNodes = false, osmStoreUncompressedWays = false, materializeGeometries = false; - int outputMode = OUTPUT_FILE; - bool logTileTimings = false; - - po::options_description desc("tilemaker " STR(TM_VERSION) "\nConvert OpenStreetMap .pbf files into vector tiles\n\nAvailable options"); - desc.add_options() - ("help", "show help message") - ("input", po::value< vector >(&inputFiles), "source .osm.pbf file") - ("output", po::value< string >(&outputFile), "target directory or .mbtiles/.pmtiles file") - ("bbox", po::value< string >(&bbox), "bounding box to use if input file does not have a bbox header set, example: minlon,minlat,maxlon,maxlat") - ("merge" ,po::bool_switch(&mergeSqlite), "merge with existing .mbtiles (overwrites otherwise)") - ("config", po::value< string >(&jsonFile)->default_value("config.json"), "config JSON file") - ("process",po::value< string >(&luaFile)->default_value("process.lua"), "tag-processing Lua file") - ("store", po::value< string >(&osmStoreFile), "temporary storage for node/ways/relations data") - ("compact",po::bool_switch(&osmStoreCompact), "Reduce overall memory usage (compact mode).\nNOTE: This requires the input to be renumbered (osmium renumber)") - ("no-compress-nodes", po::bool_switch(&osmStoreUncompressedNodes), "Store nodes uncompressed") - ("no-compress-ways", po::bool_switch(&osmStoreUncompressedWays), "Store ways uncompressed") - ("materialize-geometries", po::bool_switch(&materializeGeometries), "Materialize geometries - faster, but requires more memory") - ("verbose",po::bool_switch(&_verbose), "verbose error output") - 
("skip-integrity",po::bool_switch(&skipIntegrity), "don't enforce way/node integrity") - ("log-tile-timings", po::bool_switch(&logTileTimings), "log how long each tile takes") - ("threads",po::value< uint >(&threadNum)->default_value(0), "number of threads (automatically detected if 0)"); - po::positional_options_description p; - p.add("input", 1).add("output", 1); - po::variables_map vm; + OptionsParser::Options options; try { - po::store(po::command_line_parser(argc, argv).options(desc).positional(p).run(), vm); - } catch (const po::unknown_option& ex) { - cerr << "Unknown option: " << ex.get_option_name() << endl; - return -1; + options = OptionsParser::parse(argc, argv); + } catch (OptionsParser::OptionException& e) { + cerr << e.what() << endl; + return 1; } - po::notify(vm); - - if (vm.count("help")) { cout << desc << endl; return 0; } - if (vm.count("output")==0) { cerr << "You must specify an output file or directory. Run with --help to find out more." << endl; return -1; } - if (vm.count("input")==0) { cout << "No source .osm.pbf file supplied" << endl; } - vector bboxElements = parseBox(bbox); + if (options.showHelp) { OptionsParser::showHelp(); return 0; } - if (ends_with(outputFile, ".mbtiles") || ends_with(outputFile, ".sqlite")) { outputMode = OUTPUT_MBTILES; } - else if (ends_with(outputFile, ".pmtiles")) { outputMode = OUTPUT_PMTILES; } - if (threadNum == 0) { threadNum = max(thread::hardware_concurrency(), 1u); } - verbose = _verbose; + verbose = options.verbose; - - // ---- Check config - - if (!boost::filesystem::exists(jsonFile)) { cerr << "Couldn't open .json config: " << jsonFile << endl; return -1; } - if (!boost::filesystem::exists(luaFile )) { cerr << "Couldn't open .lua script: " << luaFile << endl; return -1; } + vector bboxElements = parseBox(options.bbox); // ---- Remove existing .mbtiles if it exists - - if ((outputMode==OUTPUT_MBTILES || outputMode==OUTPUT_PMTILES) && !mergeSqlite && static_cast(std::ifstream(outputFile))) { + if 
((options.outputMode == OptionsParser::OutputMode::MBTiles || options.outputMode == OptionsParser::OutputMode::PMTiles) && !options.mergeSqlite && static_cast(std::ifstream(options.outputFile))) { cout << "Output file exists, will overwrite (Ctrl-C to abort"; - if (outputMode==OUTPUT_MBTILES) cout << ", rerun with --merge to keep"; + if (options.outputMode == OptionsParser::OutputMode::MBTiles) cout << ", rerun with --merge to keep"; cout << ")" << endl; std::this_thread::sleep_for(std::chrono::milliseconds(2000)); - if (remove(outputFile.c_str()) != 0) { + if (remove(options.outputFile.c_str()) != 0) { cerr << "Couldn't remove existing file" << endl; return 0; } - } else if (mergeSqlite && outputMode!=OUTPUT_MBTILES) { + } else if (options.mergeSqlite && options.outputMode != OptionsParser::OutputMode::MBTiles) { cerr << "--merge only works with .mbtiles" << endl; return 0; - } else if (mergeSqlite && !static_cast(std::ifstream(outputFile))) { + } else if (options.mergeSqlite && !static_cast(std::ifstream(options.outputFile))) { cout << "--merge specified but .mbtiles file doesn't already exist, ignoring" << endl; - mergeSqlite = false; + options.mergeSqlite = false; } + // ---- Read bounding box from first .pbf (if there is one) or mapsplit file bool hasClippingBox = false; Box clippingBox; + bool mapsplit = false; MBTiles mapsplitFile; double minLon=0.0, maxLon=0.0, minLat=0.0, maxLat=0.0; if (!bboxElements.empty()) { @@ -172,14 +130,14 @@ int main(int argc, char* argv[]) { maxLon = bboxElementFromStr(bboxElements.at(2)); maxLat = bboxElementFromStr(bboxElements.at(3)); - } else if (inputFiles.size()==1 && (ends_with(inputFiles[0], ".mbtiles") || ends_with(inputFiles[0], ".sqlite") || ends_with(inputFiles[0], ".msf"))) { + } else if (options.inputFiles.size()==1 && (ends_with(options.inputFiles[0], ".mbtiles") || ends_with(options.inputFiles[0], ".sqlite") || ends_with(options.inputFiles[0], ".msf"))) { mapsplit = true; - 
mapsplitFile.openForReading(inputFiles[0]); + mapsplitFile.openForReading(options.inputFiles[0]); mapsplitFile.readBoundingBox(minLon, maxLon, minLat, maxLat); hasClippingBox = true; - } else if (inputFiles.size()>0) { - int ret = ReadPbfBoundingBox(inputFiles[0], minLon, maxLon, minLat, maxLat, hasClippingBox); + } else if (options.inputFiles.size()>0) { + int ret = ReadPbfBoundingBox(options.inputFiles[0], minLon, maxLon, minLat, maxLat, hasClippingBox); if(ret != 0) return ret; } @@ -193,7 +151,7 @@ int main(int argc, char* argv[]) { rapidjson::Document jsonConfig; class Config config; try { - FILE* fp = fopen(jsonFile.c_str(), "r"); + FILE* fp = fopen(options.jsonFile.c_str(), "r"); char readBuffer[65536]; rapidjson::FileReadStream is(fp, readBuffer, sizeof(readBuffer)); jsonConfig.ParseStream(is); @@ -211,52 +169,73 @@ int main(int argc, char* argv[]) { } // For each tile, objects to be used in processing - shared_ptr nodeStore; - bool allPbfsHaveSortTypeThenID = true; bool anyPbfHasLocationsOnWays = false; - for (const std::string& file: inputFiles) { + for (const std::string& file: options.inputFiles) { if (ends_with(file, ".pbf")) { allPbfsHaveSortTypeThenID = allPbfsHaveSortTypeThenID && PbfHasOptionalFeature(file, OptionSortTypeThenID); anyPbfHasLocationsOnWays = anyPbfHasLocationsOnWays || PbfHasOptionalFeature(file, OptionLocationsOnWays); } } - if (osmStoreCompact) - nodeStore = make_shared(); - else { - if (allPbfsHaveSortTypeThenID) - nodeStore = make_shared(!osmStoreUncompressedNodes); - else - nodeStore = make_shared(); + auto createNodeStore = [allPbfsHaveSortTypeThenID, options]() { + if (options.osm.compact) { + std::shared_ptr rv = make_shared(); + return rv; + } + + if (allPbfsHaveSortTypeThenID) { + std::shared_ptr rv = make_shared(!options.osm.uncompressedNodes); + return rv; + } + std::shared_ptr rv = make_shared(); + return rv; + }; + + shared_ptr nodeStore; + + if (options.osm.shardStores) { + nodeStore = 
std::make_shared(createNodeStore); + } else { + nodeStore = createNodeStore(); } + auto createWayStore = [anyPbfHasLocationsOnWays, allPbfsHaveSortTypeThenID, options, &nodeStore]() { + if (!anyPbfHasLocationsOnWays && allPbfsHaveSortTypeThenID) { + std::shared_ptr rv = make_shared(!options.osm.uncompressedWays, *nodeStore.get()); + return rv; + } + + std::shared_ptr rv = make_shared(); + return rv; + }; + shared_ptr wayStore; - if (!anyPbfHasLocationsOnWays && allPbfsHaveSortTypeThenID) { - wayStore = make_shared(!osmStoreUncompressedNodes, *nodeStore.get()); + if (options.osm.shardStores) { + wayStore = std::make_shared(createWayStore, *nodeStore.get()); } else { - wayStore = make_shared(); + wayStore = createWayStore(); } OSMStore osmStore(*nodeStore.get(), *wayStore.get()); - osmStore.use_compact_store(osmStoreCompact); - osmStore.enforce_integrity(!skipIntegrity); - if(!osmStoreFile.empty()) { - std::cout << "Using osm store file: " << osmStoreFile << std::endl; - osmStore.open(osmStoreFile); + osmStore.use_compact_store(options.osm.compact); + osmStore.enforce_integrity(!options.osm.skipIntegrity); + if(!options.osm.storeFile.empty()) { + std::cout << "Using osm store file: " << options.osm.storeFile << std::endl; + osmStore.open(options.osm.storeFile); } AttributeStore attributeStore; class LayerDefinition layers(config.layers); - class OsmMemTiles osmMemTiles(threadNum, config.baseZoom, config.includeID, *nodeStore, *wayStore); - class ShpMemTiles shpMemTiles(threadNum, config.baseZoom); + class OsmMemTiles osmMemTiles(options.threadNum, config.baseZoom, config.includeID, *nodeStore, *wayStore); + class ShpMemTiles shpMemTiles(options.threadNum, config.baseZoom); osmMemTiles.open(); shpMemTiles.open(); - OsmLuaProcessing osmLuaProcessing(osmStore, config, layers, luaFile, - shpMemTiles, osmMemTiles, attributeStore, materializeGeometries); + OsmLuaProcessing osmLuaProcessing(osmStore, config, layers, options.luaFile, + shpMemTiles, osmMemTiles, 
attributeStore, options.osm.materializeGeometries); // ---- Load external shp files @@ -274,7 +253,7 @@ int main(int argc, char* argv[]) { readShapefile(clippingBox, layers, config.baseZoom, layerNum, - threadNum, + options.threadNum, shpMemTiles, osmLuaProcessing); } } @@ -287,28 +266,31 @@ int main(int argc, char* argv[]) { // ---- Read all PBFs - PbfReader pbfReader(osmStore); + PbfProcessor pbfProcessor(osmStore); std::vector sortOrders = layers.getSortOrders(); if (!mapsplit) { - for (auto inputFile : inputFiles) { + for (auto inputFile : options.inputFiles) { cout << "Reading .pbf " << inputFile << endl; ifstream infile(inputFile, ios::in | ios::binary); if (!infile) { cerr << "Couldn't open .pbf file " << inputFile << endl; return -1; } const bool hasSortTypeThenID = PbfHasOptionalFeature(inputFile, OptionSortTypeThenID); - int ret = pbfReader.ReadPbfFile( + int ret = pbfProcessor.ReadPbfFile( + nodeStore->shards(), hasSortTypeThenID, nodeKeys, - threadNum, + options.threadNum, [&]() { thread_local std::shared_ptr pbfStream(new ifstream(inputFile, ios::in | ios::binary)); return pbfStream; }, [&]() { - thread_local std::shared_ptr osmLuaProcessing(new OsmLuaProcessing(osmStore, config, layers, luaFile, shpMemTiles, osmMemTiles, attributeStore, materializeGeometries)); + thread_local std::shared_ptr osmLuaProcessing(new OsmLuaProcessing(osmStore, config, layers, options.luaFile, shpMemTiles, osmMemTiles, attributeStore, options.osm.materializeGeometries)); return osmLuaProcessing; - } + }, + *nodeStore, + *wayStore ); if (ret != 0) return ret; } @@ -319,16 +301,16 @@ int main(int argc, char* argv[]) { // ---- Initialise SharedData SourceList sources = {&osmMemTiles, &shpMemTiles}; class SharedData sharedData(config, layers); - sharedData.outputFile = outputFile; - sharedData.outputMode = outputMode; - sharedData.mergeSqlite = mergeSqlite; + sharedData.outputFile = options.outputFile; + sharedData.outputMode = options.outputMode; + sharedData.mergeSqlite = 
options.mergeSqlite; // ---- Initialise mbtiles/pmtiles if required - if (sharedData.outputMode==OUTPUT_MBTILES) { + if (sharedData.outputMode == OptionsParser::OutputMode::MBTiles) { sharedData.mbtiles.openForWriting(sharedData.outputFile); sharedData.writeMBTilesProjectData(); - } else if (sharedData.outputMode==OUTPUT_PMTILES) { + } else if (sharedData.outputMode == OptionsParser::OutputMode::PMTiles) { sharedData.pmtiles.open(sharedData.outputFile); } @@ -361,7 +343,8 @@ int main(int argc, char* argv[]) { cout << "Reading tile " << srcZ << ": " << srcX << "," << srcY << " (" << (run+1) << "/" << runs << ")" << endl; vector pbf = mapsplitFile.readTile(srcZ,srcX,tmsY); - int ret = pbfReader.ReadPbfFile( + int ret = pbfProcessor.ReadPbfFile( + nodeStore->shards(), false, nodeKeys, 1, @@ -369,8 +352,10 @@ int main(int argc, char* argv[]) { return make_unique(pbf.data(), pbf.size(), ios::in | ios::binary); }, [&]() { - return std::make_unique(osmStore, config, layers, luaFile, shpMemTiles, osmMemTiles, attributeStore, materializeGeometries); - } + return std::make_unique(osmStore, config, layers, options.luaFile, shpMemTiles, osmMemTiles, attributeStore, options.osm.materializeGeometries); + }, + *nodeStore, + *wayStore ); if (ret != 0) return ret; @@ -378,7 +363,7 @@ int main(int argc, char* argv[]) { } // Launch the pool with threadNum threads - boost::asio::thread_pool pool(threadNum); + boost::asio::thread_pool pool(options.threadNum); // Mutex is hold when IO is performed std::mutex io_mutex; @@ -387,14 +372,14 @@ int main(int argc, char* argv[]) { std::atomic tilesWritten(0); for (auto source : sources) { - source->finalize(threadNum); + source->finalize(options.threadNum); } // tiles by zoom level // The clipping bbox check is expensive - as an optimization, compute the set of // z6 tiles that are wholly covered by the clipping box. Membership in this // set is quick to test. 
- std::set coveredZ6Tiles; + TileCoordinatesSet coveredZ6Tiles(6); if (hasClippingBox) { for (int x = 0; x < 1 << 6; x++) { for (int y = 0; y < 1 << 6; y++) { @@ -402,20 +387,47 @@ int main(int argc, char* argv[]) { TileBbox(TileCoordinates(x, y), 6, false, false).getTileBox(), clippingBox )) - coveredZ6Tiles.insert(TileCoordinates(x, y)); + coveredZ6Tiles.set(x, y); } } } // For large areas (arbitrarily defined as 100 z6 tiles), use a dense index for pmtiles - if (coveredZ6Tiles.size()>100 && outputMode==OUTPUT_PMTILES) { + if (coveredZ6Tiles.size()>100 && options.outputMode == OptionsParser::OutputMode::PMTiles) { std::cout << "Using dense index for .pmtiles" << std::endl; sharedData.pmtiles.isSparse = false; } std::deque> tileCoordinates; + std::vector zoomResults; + for (uint zoom = 0; zoom <= sharedData.config.endZoom; zoom++) { + zoomResults.push_back(TileCoordinatesSet(zoom)); + } + + { +#ifdef CLOCK_MONOTONIC + timespec start, end; + clock_gettime(CLOCK_MONOTONIC, &start); +#endif + std::cout << "collecting tiles" << std::flush; + populateTilesAtZoom(sources, zoomResults); +#ifdef CLOCK_MONOTONIC + clock_gettime(CLOCK_MONOTONIC, &end); + uint64_t tileNs = 1e9 * (end.tv_sec - start.tv_sec) + end.tv_nsec - start.tv_nsec; + std::cout << ": " << (uint32_t)(tileNs / 1e6) << "ms"; +#endif + } + + std::cout << ", filtering tiles:" << std::flush; for (uint zoom=sharedData.config.startZoom; zoom <= sharedData.config.endZoom; zoom++) { - auto zoomResult = getTilesAtZoom(sources, zoom); + std::cout << " z" << std::to_string(zoom) << std::flush; +#ifdef CLOCK_MONOTONIC + timespec start, end; + clock_gettime(CLOCK_MONOTONIC, &start); +#endif + + const auto& zoomResult = zoomResults[zoom]; + int numTiles = 0; for (int x = 0; x < 1 << zoom; x++) { for (int y = 0; y < 1 << zoom; y++) { if (!zoomResult.test(x, y)) @@ -433,7 +445,7 @@ int main(int argc, char* argv[]) { if (zoom >= 6) { TileCoordinate z6x = x / (1 << (zoom - 6)); TileCoordinate z6y = y / (1 << (zoom - 6)); - 
isInAWhollyCoveredZ6Tile = coveredZ6Tiles.find(TileCoordinates(z6x, z6y)) != coveredZ6Tiles.end(); + isInAWhollyCoveredZ6Tile = coveredZ6Tiles.test(z6x, z6y); } if(!isInAWhollyCoveredZ6Tile && !boost::geometry::intersects(TileBbox(TileCoordinates(x, y), zoom, false, false).getTileBox(), clippingBox)) @@ -441,9 +453,22 @@ int main(int argc, char* argv[]) { } tileCoordinates.push_back(std::make_pair(zoom, TileCoordinates(x, y))); + numTiles++; } } + + std::cout << " (" << numTiles; +#ifdef CLOCK_MONOTONIC + clock_gettime(CLOCK_MONOTONIC, &end); + uint64_t tileNs = 1e9 * (end.tv_sec - start.tv_sec) + end.tv_nsec - start.tv_nsec; + std::cout << ", " << (uint32_t)(tileNs / 1e6) << "ms"; + +#endif + std::cout << ")" << std::flush; } + zoomResults.clear(); + + std::cout << std::endl; // Cluster tiles: breadth-first for z0..z5, depth-first for z6 const size_t baseZoom = config.baseZoom; @@ -494,7 +519,7 @@ int main(int argc, char* argv[]) { return false; }, - threadNum); + options.threadNum); std::size_t batchSize = 0; for(std::size_t startIndex = 0; startIndex < tileCoordinates.size(); startIndex += batchSize) { @@ -523,9 +548,9 @@ int main(int argc, char* argv[]) { unsigned int zoom = tileCoordinates[i].first; TileCoordinates coords = tileCoordinates[i].second; -#ifndef _WIN32 +#ifdef CLOCK_MONOTONIC timespec start, end; - if (logTileTimings) + if (options.logTileTimings) clock_gettime(CLOCK_MONOTONIC, &start); #endif @@ -535,8 +560,8 @@ int main(int argc, char* argv[]) { } outputProc(sharedData, sources, attributeStore, data, coords, zoom); -#ifndef _WIN32 - if (logTileTimings) { +#ifdef CLOCK_MONOTONIC + if (options.logTileTimings) { clock_gettime(CLOCK_MONOTONIC, &end); uint64_t tileNs = 1e9 * (end.tv_sec - start.tv_sec) + end.tv_nsec - start.tv_nsec; std::string output = "z" + std::to_string(zoom) + "/" + std::to_string(coords.x) + "/" + std::to_string(coords.y) + " took " + std::to_string(tileNs/1e6) + " ms"; @@ -545,7 +570,7 @@ int main(int argc, char* argv[]) { 
#endif } - if (logTileTimings) { + if (options.logTileTimings) { const std::lock_guard lock(io_mutex); std::cout << std::endl; for (const auto& output : tileTimings) @@ -575,10 +600,10 @@ int main(int argc, char* argv[]) { // ---- Close tileset - if (outputMode==OUTPUT_MBTILES) { + if (options.outputMode == OptionsParser::OutputMode::MBTiles) { sharedData.writeMBTilesMetadata(jsonConfig); sharedData.mbtiles.closeForWriting(); - } else if (outputMode==OUTPUT_PMTILES) { + } else if (options.outputMode == OptionsParser::OutputMode::PMTiles) { sharedData.writePMTilesBounds(); std::string metadata = sharedData.pmTilesMetadata(); sharedData.pmtiles.close(metadata); diff --git a/src/way_stores.cpp b/src/way_stores.cpp index 05d884d0..790ad816 100644 --- a/src/way_stores.cpp +++ b/src/way_stores.cpp @@ -14,6 +14,14 @@ void BinarySearchWayStore::reopen() { mLatpLonLists = std::make_unique(); } +bool BinarySearchWayStore::contains(size_t shard, WayID id) const { + auto iter = std::lower_bound(mLatpLonLists->begin(), mLatpLonLists->end(), id, [](auto const &e, auto id) { + return e.first < id; + }); + + return !(iter == mLatpLonLists->end() || iter->first != id); +} + std::vector BinarySearchWayStore::at(WayID wayid) const { std::lock_guard lock(mutex); @@ -39,7 +47,7 @@ void BinarySearchWayStore::insertLatpLons(std::vector &n std::copy(std::make_move_iterator(newWays.begin()), std::make_move_iterator(newWays.end()), mLatpLonLists->begin() + i); } -const void BinarySearchWayStore::insertNodes(const std::vector>>& newWays) { +void BinarySearchWayStore::insertNodes(const std::vector>>& newWays) { throw std::runtime_error("BinarySearchWayStore does not support insertNodes"); } diff --git a/test/append_vector.test.cpp b/test/append_vector.test.cpp new file mode 100644 index 00000000..db4949e2 --- /dev/null +++ b/test/append_vector.test.cpp @@ -0,0 +1,98 @@ +#include +#include +#include "external/minunit.h" +#include "append_vector.h" + +using namespace AppendVectorNS; + 
+MU_TEST(test_append_vector) { + AppendVector vec; + AppendVector vec2; + mu_check(vec.size() == 0); + mu_check(vec.begin() == vec.end()); + mu_check(vec.begin() != vec2.begin()); + + for (int i = 0; i < 10000; i++) { + vec.push_back(i); + } + mu_check(vec.size() == 10000); + + mu_check(vec[25] == 25); + + const AppendVector::Iterator& it = vec.begin(); + mu_check(*it == 0); + mu_check(*(it + 1) == 1); + mu_check(*(it + 2) == 2); + mu_check(*(it + 9000) == 9000); + mu_check(*(it + 1 - 1) == 0); + mu_check(*(vec.end() + -1) == 9999); + mu_check(*(vec.end() - 1) == 9999); + mu_check(*(vec.end() - 2) == 9998); + mu_check(*(vec.end() - 9000) == 1000); + mu_check(*(vec.begin() - -1) == 1); + + boost::sort::block_indirect_sort( + vec.begin(), + vec.end(), + [](auto const &a, auto const&b) { return b < a; }, + 1 + ); + + mu_check(vec[0] == 9999); + mu_check(vec[9999] == 0); + + boost::sort::block_indirect_sort( + vec.begin(), + vec.end(), + [](auto const &a, auto const&b) { return a < b; }, + 1 + ); + + mu_check(vec[0] == 0); + mu_check(vec[9999] == 9999); + + auto iter = std::lower_bound( + vec.begin(), + vec.end(), + 123, + [](const int32_t& a, const int32_t& toFind) { + return a < toFind; + } + ); + + mu_check(iter != vec.end()); + mu_check(*iter == 123); + + iter = std::lower_bound( + vec.begin(), + vec.end(), + 123123, + [](const int32_t& a, const int32_t& toFind) { + return a < toFind; + } + ); + + mu_check(iter == vec.end()); + + iter = std::lower_bound( + vec.begin(), + vec.end(), + -2, + [](const int32_t& a, const int32_t& toFind) { + return a < toFind; + } + ); + + mu_check(iter == vec.begin()); +} + +MU_TEST_SUITE(test_suite_append_vector) { + MU_RUN_TEST(test_append_vector); +} + +int main() { + MU_RUN_SUITE(test_suite_append_vector); + MU_REPORT(); + return MU_EXIT_CODE; +} + diff --git a/test/attribute_store.test.cpp b/test/attribute_store.test.cpp new file mode 100644 index 00000000..db104a14 --- /dev/null +++ b/test/attribute_store.test.cpp @@ -0,0 +1,103 
@@ +#include +#include +#include "external/minunit.h" +#include "attribute_store.h" + +MU_TEST(test_attribute_store) { + AttributeStore store; + store.reset(); + + mu_check(store.size() == 0); + + AttributeSet s1; + store.addAttribute(s1, "str1", std::string("someval"), 0); + store.addAttribute(s1, "str2", std::string("a very long string"), 0); + store.addAttribute(s1, "bool1", false, 0); + store.addAttribute(s1, "bool2", true, 0); + store.addAttribute(s1, "float1", (float)42.0, 0); + + const auto s1Index = store.add(s1); + + mu_check(store.size() == 1); + + const auto s1Pairs = store.getUnsafe(s1Index); + mu_check(s1Pairs.size() == 5); + const auto str1 = std::find_if(s1Pairs.begin(), s1Pairs.end(), [&store](auto ap) { + return ap->keyIndex == store.keyStore.key2index("str1"); + }); + mu_check(str1 != s1Pairs.end()); + mu_check((*str1)->hasStringValue()); + mu_check((*str1)->stringValue() == "someval"); + + const auto str2 = std::find_if(s1Pairs.begin(), s1Pairs.end(), [&store](auto ap) { + return ap->keyIndex == store.keyStore.key2index("str2"); + }); + mu_check(str2 != s1Pairs.end()); + mu_check((*str2)->hasStringValue()); + mu_check((*str2)->stringValue() == "a very long string"); + + const auto bool1 = std::find_if(s1Pairs.begin(), s1Pairs.end(), [&store](auto ap) { + return ap->keyIndex == store.keyStore.key2index("bool1"); + }); + mu_check(bool1 != s1Pairs.end()); + mu_check((*bool1)->hasBoolValue()); + mu_check((*bool1)->boolValue() == false); + + const auto bool2 = std::find_if(s1Pairs.begin(), s1Pairs.end(), [&store](auto ap) { + return ap->keyIndex == store.keyStore.key2index("bool2"); + }); + mu_check(bool2 != s1Pairs.end()); + mu_check((*bool2)->hasBoolValue()); + mu_check((*bool2)->boolValue() == true); + + const auto float1 = std::find_if(s1Pairs.begin(), s1Pairs.end(), [&store](auto ap) { + return ap->keyIndex == store.keyStore.key2index("float1"); + }); + mu_check(float1 != s1Pairs.end()); + mu_check((*float1)->hasFloatValue()); + 
mu_check((*float1)->floatValue() == 42); +} + +MU_TEST(test_attribute_store_reuses) { + AttributeStore store; + store.reset(); + + mu_check(store.size() == 0); + + { + AttributeSet s1a; + store.addAttribute(s1a, "str1", std::string("someval"), 0); + const auto s1aIndex = store.add(s1a); + + AttributeSet s1b; + store.addAttribute(s1b, "str1", std::string("someval"), 0); + const auto s1bIndex = store.add(s1b); + + mu_check(s1aIndex == s1bIndex); + } + + { + AttributeSet s1a; + store.addAttribute(s1a, "str1", std::string("this is a very long string"), 0); + const auto s1aIndex = store.add(s1a); + + AttributeSet s1b; + store.addAttribute(s1b, "str1", std::string("this is a very long string"), 0); + const auto s1bIndex = store.add(s1b); + + mu_check(s1aIndex == s1bIndex); + } + + +} + +MU_TEST_SUITE(test_suite_attribute_store) { + MU_RUN_TEST(test_attribute_store); + MU_RUN_TEST(test_attribute_store_reuses); +} + +int main() { + MU_RUN_SUITE(test_suite_attribute_store); + MU_REPORT(); + return MU_EXIT_CODE; +} diff --git a/test/deque_map.test.cpp b/test/deque_map.test.cpp new file mode 100644 index 00000000..28023542 --- /dev/null +++ b/test/deque_map.test.cpp @@ -0,0 +1,67 @@ +#include <string> +#include <vector> +#include "external/minunit.h" +#include "deque_map.h" + +MU_TEST(test_deque_map) { + DequeMap<std::string> strs; + + mu_check(strs.size() == 0); + mu_check(!strs.full()); + mu_check(strs.find("foo") == -1); + mu_check(strs.add("foo") == 0); + mu_check(!strs.full()); + mu_check(strs.find("foo") == 0); + mu_check(strs.size() == 1); + mu_check(strs.add("foo") == 0); + mu_check(strs.size() == 1); + mu_check(strs.add("bar") == 1); + mu_check(strs.size() == 2); + mu_check(strs.add("aardvark") == 2); + mu_check(strs.size() == 3); + mu_check(strs.add("foo") == 0); + mu_check(strs.add("bar") == 1); + mu_check(strs.add("quux") == 3); + mu_check(strs.size() == 4); + + mu_check(strs.at(0) == "foo"); + mu_check(strs[0] == "foo"); + mu_check(strs.at(1) == "bar"); + mu_check(strs[1] == "bar"); + 
mu_check(strs.at(2) == "aardvark"); + mu_check(strs[2] == "aardvark"); + mu_check(strs.at(3) == "quux"); + mu_check(strs[3] == "quux"); + + std::vector<std::string> rv; + for (std::string x : strs) { + rv.push_back(x); + } + mu_check(rv[0] == "aardvark"); + mu_check(rv[1] == "bar"); + mu_check(rv[2] == "foo"); + mu_check(rv[3] == "quux"); + + DequeMap<std::string> boundedMap(1); + mu_check(!boundedMap.full()); + mu_check(boundedMap.add("foo") == 0); + mu_check(boundedMap.add("foo") == 0); + mu_check(boundedMap.full()); + mu_check(boundedMap.add("bar") == -1); + boundedMap.clear(); + mu_check(!boundedMap.full()); + mu_check(boundedMap.find("foo") == -1); + mu_check(boundedMap.add("bar") == 0); + mu_check(boundedMap.add("bar") == 0); + mu_check(boundedMap.full()); +} + +MU_TEST_SUITE(test_suite_deque_map) { + MU_RUN_TEST(test_deque_map); +} + +int main() { + MU_RUN_SUITE(test_suite_deque_map); + MU_REPORT(); + return MU_EXIT_CODE; +} diff --git a/test/monaco.pbf b/test/monaco.pbf new file mode 100644 index 00000000..6e6c3122 Binary files /dev/null and b/test/monaco.pbf differ diff --git a/test/options_parser.test.cpp b/test/options_parser.test.cpp new file mode 100644 index 00000000..e230fc0d --- /dev/null +++ b/test/options_parser.test.cpp @@ -0,0 +1,107 @@ +#include <vector> +#include "external/minunit.h" +#include "options_parser.h" + +const char* PROGRAM_NAME = "./tilemaker"; +using namespace OptionsParser; + +Options parse(std::vector<std::string>& args) { + const char* argv[100]; + + argv[0] = PROGRAM_NAME; + for(int i = 0; i < args.size(); i++) + argv[1 + i] = args[i].data(); + + return parse(1 + args.size(), argv); +} + +#define ASSERT_THROWS(MESSAGE, ...) 
\ +{ \ + std::vector<std::string> args = { __VA_ARGS__ }; \ + bool threw = false; \ + try { \ + auto opts = parse(args); \ + } catch(OptionsParser::OptionException& e) { \ + threw = std::string(e.what()).find(MESSAGE) != std::string::npos; \ + } \ + if (!threw) mu_check((std::string("expected exception with ") + MESSAGE).empty()); \ +} + +MU_TEST(test_options_parser) { + // No args is invalid. + ASSERT_THROWS("You must specify an output file"); + + // Output without input is invalid + ASSERT_THROWS("No source .osm.pbf", "--output", "foo.mbtiles"); + + // You can ask for --help. + { + std::vector<std::string> args = {"--help"}; + auto opts = parse(args); + mu_check(opts.showHelp); + } + + // Minimal valid is output and input + { + std::vector<std::string> args = {"--output", "foo.mbtiles", "--input", "ontario.pbf"}; + auto opts = parse(args); + mu_check(opts.inputFiles.size() == 1); + mu_check(opts.inputFiles[0] == "ontario.pbf"); + mu_check(opts.outputFile == "foo.mbtiles"); + mu_check(opts.outputMode == OutputMode::MBTiles); + mu_check(opts.osm.materializeGeometries); + mu_check(!opts.osm.shardStores); + } + + // --lazy-geometries overrides default + { + std::vector<std::string> args = {"--output", "foo.mbtiles", "--input", "ontario.pbf", "--lazy-geometries"}; + auto opts = parse(args); + mu_check(opts.inputFiles.size() == 1); + mu_check(opts.inputFiles[0] == "ontario.pbf"); + mu_check(opts.outputFile == "foo.mbtiles"); + mu_check(opts.outputMode == OutputMode::MBTiles); + mu_check(!opts.osm.materializeGeometries); + mu_check(opts.osm.lazyGeometries); + mu_check(!opts.osm.shardStores); + } + + // --store should optimize for reduced memory + { + std::vector<std::string> args = {"--output", "foo.mbtiles", "--input", "ontario.pbf", "--store", "/tmp/store"}; + auto opts = parse(args); + mu_check(opts.inputFiles.size() == 1); + mu_check(opts.inputFiles[0] == "ontario.pbf"); + mu_check(opts.outputFile == "foo.mbtiles"); + mu_check(opts.outputMode == OutputMode::MBTiles); + mu_check(opts.osm.storeFile == "/tmp/store"); + 
mu_check(!opts.osm.materializeGeometries); + mu_check(opts.osm.shardStores); + } + + // --store --fast should optimize for speed + { + std::vector<std::string> args = {"--output", "foo.pmtiles", "--input", "ontario.pbf", "--store", "/tmp/store", "--fast"}; + auto opts = parse(args); + mu_check(opts.inputFiles.size() == 1); + mu_check(opts.inputFiles[0] == "ontario.pbf"); + mu_check(opts.outputFile == "foo.pmtiles"); + mu_check(opts.outputMode == OutputMode::PMTiles); + mu_check(opts.osm.storeFile == "/tmp/store"); + mu_check(!opts.osm.materializeGeometries); + mu_check(!opts.osm.shardStores); + } + + ASSERT_THROWS("Couldn't open .json config", "--input", "foo", "--output", "bar", "--config", "nonexistent-config.json"); + ASSERT_THROWS("Couldn't open .lua script", "--input", "foo", "--output", "bar", "--process", "nonexistent-script.lua"); +} + +MU_TEST_SUITE(test_suite_options_parser) { + MU_RUN_TEST(test_options_parser); +} + +int main() { + MU_RUN_SUITE(test_suite_options_parser); + MU_REPORT(); + return MU_EXIT_CODE; +} diff --git a/test/pbf_reader.test.cpp b/test/pbf_reader.test.cpp new file mode 100644 index 00000000..8d4c8fad --- /dev/null +++ b/test/pbf_reader.test.cpp @@ -0,0 +1,135 @@ +#include <fstream> +#include <iostream> +#include <string> +#include "external/minunit.h" +#include "pbf_reader.h" + +MU_TEST(test_pbf_reader) { + std::string filename; + filename = "test/monaco.pbf"; +// filename = "/home/cldellow/Downloads/north-america-latest.osm.pbf"; +// filename = "/home/cldellow/Downloads/great-britain-latest.osm.pbf"; +// filename = "/home/cldellow/Downloads/nova-scotia-latest.osm.pbf"; + std::ifstream monaco(filename, std::ifstream::in); + + PbfReader::PbfReader reader; + PbfReader::BlobHeader bh = reader.readBlobHeader(monaco); + protozero::data_view blob = reader.readBlob(bh.datasize, monaco); + PbfReader::HeaderBlock header = reader.readHeaderBlock(blob); + + mu_check(header.hasBbox); + mu_check(header.optionalFeatures.size() == 1); + 
mu_check(header.optionalFeatures.find("Sort.Type_then_ID") != header.optionalFeatures.end()); + + mu_check(header.bbox.minLon == 7.409205); + mu_check(header.bbox.maxLon == 7.448637); + mu_check(header.bbox.minLat == 43.723350); + mu_check(header.bbox.maxLat == 43.751690); + + + bool foundNode = false, foundWay = false, foundRelation = false; + int blocks = 0, groups = 0, strings = 0, nodes = 0, ways = 0, relations = 0; + while (!monaco.eof()) { + bh = reader.readBlobHeader(monaco); + if (bh.type == "eof") + break; + + + blocks++; + blob = reader.readBlob(bh.datasize, monaco); + + PbfReader::PrimitiveBlock pb = reader.readPrimitiveBlock(blob); + + for (const auto str : pb.stringTable) { + if (strings == 200) { + std::string s(str.data(), str.size()); + mu_check(s == "description:FR"); + } + strings++; + } + + for (const auto& group : pb.groups()) { + groups++; + for (const auto& node : group.nodes()) { + nodes++; + + if (node.id == 21911886) { + foundNode = true; + + bool foundHighwayCrossing = false; + + for (int i = node.tagStart; i < node.tagEnd; i += 2) { + const auto keyIndex = group.translateNodeKeyValue(i); + const auto valueIndex = group.translateNodeKeyValue(i + 1); + std::string key(pb.stringTable[keyIndex].data(), pb.stringTable[keyIndex].size()); + std::string value(pb.stringTable[valueIndex].data(), pb.stringTable[valueIndex].size()); + + if (key == "highway" && value == "crossing") + foundHighwayCrossing = true; + } + mu_check(foundHighwayCrossing); + } + } + + for (const auto& way : group.ways()) { + ways++; + + if (way.id == 4224978) { + foundWay = true; + + bool foundSportSoccer = false; + for (int i = 0; i < way.keys.size(); i++) { + std::string key(pb.stringTable[way.keys[i]].data(), pb.stringTable[way.keys[i]].size()); + std::string value(pb.stringTable[way.vals[i]].data(), pb.stringTable[way.vals[i]].size()); + + if (key == "sport" && value == "soccer") + foundSportSoccer = true; + } + mu_check(foundSportSoccer); + + mu_check(way.refs.size() == 
5); + mu_check(way.refs[0] == 25178088); + mu_check(way.refs[2] == 25178045); + mu_check(way.refs[4] == 25178088); + } + } + + for (const auto& relation : group.relations()) { + relations++; + + if (relation.id == 1124039) { + foundRelation = true; + mu_check(relation.memids.size() == 17); + mu_check(relation.types.size() == 17); + mu_check(relation.roles_sid.size() == 17); + mu_check(relation.types[0] == PbfReader::Relation::MemberType::NODE); + mu_check(relation.types[2] == PbfReader::Relation::MemberType::WAY); + mu_check(relation.types[16] == PbfReader::Relation::MemberType::RELATION); + } + } + } + } + + //std::cout << blocks << " blocks, " << groups << " groups, " << nodes << " nodes, " << ways << " ways, " << relations << " relations" << std::endl; + + mu_check(foundNode); + mu_check(foundWay); + mu_check(foundRelation); + + mu_check(blocks == 6); + mu_check(groups == 6); + mu_check(strings == 8236); + mu_check(nodes == 30477); + mu_check(ways == 4825); + mu_check(relations == 285); +} + +MU_TEST_SUITE(test_suite_pbf_reader) { + MU_RUN_TEST(test_pbf_reader); +} + +int main() { + MU_RUN_SUITE(test_suite_pbf_reader); + MU_REPORT(); + return MU_EXIT_CODE; +} diff --git a/test/pooled_string.test.cpp b/test/pooled_string.test.cpp new file mode 100644 index 00000000..91fb2da5 --- /dev/null +++ b/test/pooled_string.test.cpp @@ -0,0 +1,55 @@ +#include +#include "external/minunit.h" +#include "pooled_string.h" + +MU_TEST(test_pooled_string) { + mu_check(PooledString("").size() == 0); + mu_check(PooledString("").toString() == ""); + mu_check(PooledString("f").size() == 1); + mu_check(PooledString("f").toString() == "f"); + mu_check(PooledString("hi").size() == 2); + mu_check(PooledString("f") == PooledString("f")); + mu_check(PooledString("f") != PooledString("g")); + + mu_check(PooledString("this is more than fifteen bytes").size() == 31); + mu_check(PooledString("this is more than fifteen bytes") != PooledString("f")); + + PooledString big("this is also a really 
long string"); + mu_check(big == big); + mu_check(big.toString() == "this is also a really long string"); + + PooledString big2("this is also a quite long string"); + mu_check(big != big2); + mu_check(big.toString() != big2.toString()); + + std::string shortString("short"); + std::string longString("this is a very long string"); + + PooledString stdShortString(&shortString); + mu_check(stdShortString.size() == 5); + mu_check(stdShortString.toString() == "short"); + + PooledString stdLongString(&longString); + mu_check(stdLongString.size() == 26); + mu_check(stdLongString.toString() == "this is a very long string"); + + // PooledStrings that are backed by std::string have the usual + // == semantics. + mu_check(stdShortString == PooledString("short")); + mu_check(PooledString("short") == stdShortString); + + mu_check(stdLongString == PooledString("this is a very long string")); + mu_check(PooledString("this is a very long string") == stdLongString); + + mu_check(stdShortString != stdLongString); +} + +MU_TEST_SUITE(test_suite_pooled_string) { + MU_RUN_TEST(test_pooled_string); +} + +int main() { + MU_RUN_SUITE(test_suite_pooled_string); + MU_REPORT(); + return MU_EXIT_CODE; +} diff --git a/test/sorted_node_store.test.cpp b/test/sorted_node_store.test.cpp new file mode 100644 index 00000000..de66445f --- /dev/null +++ b/test/sorted_node_store.test.cpp @@ -0,0 +1,41 @@ +#include +#include "external/minunit.h" +#include "sorted_node_store.h" + +MU_TEST(test_sorted_node_store) { + bool compressed = true; + + for (int i = 0; i < 2; i++) { + compressed = !compressed; + SortedNodeStore s1(compressed), s2(compressed); + mu_check(s1.size() == 0); + mu_check(s2.size() == 0); + + s1.batchStart(); + s2.batchStart(); + + s1.insert({ {1, {2, 3 } } }); + s2.insert({ {2, {3, 4 } } }); + + s1.finalize(1); + s2.finalize(1); + + mu_check(s1.size() == 1); + mu_check(s1.at(1) == LatpLon({2, 3})); + mu_check(s1.contains(0, 1)); + mu_check(!s1.contains(0, 2)); + mu_check(!s1.contains(0, 
1ull << 34)); + mu_check(s2.size() == 1); + mu_check(s2.at(2) == LatpLon({3, 4})); + } +} + +MU_TEST_SUITE(test_suite_sorted_node_store) { + MU_RUN_TEST(test_sorted_node_store); +} + +int main() { + MU_RUN_SUITE(test_suite_sorted_node_store); + MU_REPORT(); + return MU_EXIT_CODE; +} diff --git a/test/sorted_way_store.test.cpp b/test/sorted_way_store.test.cpp index 1c50a494..65d34816 100644 --- a/test/sorted_way_store.test.cpp +++ b/test/sorted_way_store.test.cpp @@ -13,6 +13,10 @@ class TestNodeStore : public NodeStore { return { (int32_t)id, -(int32_t)id }; } void insert(const std::vector>& elements) override {} + + bool contains(size_t shard, NodeID id) const override { return true; } + size_t shard() const override { return 0; } + size_t shards() const override { return 1; } }; void roundtripWay(const std::vector& way) { @@ -70,6 +74,39 @@ MU_TEST(test_encode_way) { } } +MU_TEST(test_multiple_stores) { + bool compressed = false; + + for (int i = 0; i < 2; i++) { + compressed = !compressed; + TestNodeStore ns; + SortedWayStore s1(compressed, ns), s2(compressed, ns); + s1.batchStart(); + s2.batchStart(); + + s1.insertNodes({{ 1, { 1 } }}); + + // We store small ways differently than large ways, so + // store both kinds for testing. + std::vector longWay; + for (int i = 200; i < 2048; i++) + longWay.push_back(i + 3 * (i % 37)); + + s1.insertNodes({{ 42, longWay }}); + s2.insertNodes({{ 2, { 2 } }}); + + s1.finalize(1); + s2.finalize(1); + + mu_check(s1.size() == 2); + mu_check(s2.size() == 1); + + mu_check(s1.contains(0, 1)); + mu_check(s1.contains(0, 42)); + mu_check(!s1.contains(0, 2)); + } +} + MU_TEST(test_way_store) { TestNodeStore ns; SortedWayStore sws(true, ns); @@ -178,6 +215,7 @@ MU_TEST(test_populate_mask) { MU_TEST_SUITE(test_suite_sorted_way_store) { MU_RUN_TEST(test_encode_way); + MU_RUN_TEST(test_multiple_stores); MU_RUN_TEST(test_way_store); }