Merge branch 'feature/append-mode'
tmontaigu committed Sep 3, 2020
2 parents cf1b84c + 7905534 commit 4944aab
Showing 12 changed files with 345 additions and 69 deletions.
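For orientation, a minimal usage sketch of the new append mode (assumptions: LasAppender is imported from pylas.lasappender, since this diff does not show a public entry point, and points_data on a read file is a compatible PointRecord):

import pylas
from pylas.lasappender import LasAppender

# Points to append; both files must share the same point format,
# otherwise append_points raises PylasError.
new_points = pylas.read("extra_points.las").points_data  # assumed attribute

# The destination must be a seekable, writable binary stream.
with open("existing.las", "rb+") as dest:
    with LasAppender(dest, closefd=False) as appender:
        appender.append_points(new_points)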
1 change: 1 addition & 0 deletions pylas/__init__.py
@@ -6,6 +6,7 @@
from .evlrs import EVLR
from .headers import HeaderFactory
from .laswriter import LasWriter
from .errors import PylasError
from .lib import LazBackend
from .lib import convert, create_from_header
from .lib import create_las as create
33 changes: 33 additions & 0 deletions pylas/headers/rawheader.py
@@ -8,6 +8,7 @@

from .. import compression, utils
from .. import errors
from ..point.record import PointRecord

logger = logging.getLogger(__name__)

@@ -255,6 +256,38 @@ def set_compressed(self, compressed: bool):
self._point_data_format_id
)

def update(self, points: PointRecord) -> None:
self.x_max = max(
self.x_max,
(points["X"].max() * self.x_scale) + self.x_offset,
)
self.y_max = max(
self.y_max,
(points["Y"].max() * self.y_scale) + self.y_offset,
)
self.z_max = max(
self.z_max,
(points["Z"].max() * self.z_scale) + self.z_offset,
)
self.x_min = min(
self.x_min,
(points["X"].min() * self.x_scale) + self.x_offset,
)
self.y_min = min(
self.y_min,
(points["Y"].min() * self.y_scale) + self.y_offset,
)
self.z_min = min(
self.z_min,
(points["Z"].min() * self.z_scale) + self.z_offset,
)

        for i, count in zip(*np.unique(points.return_number, return_counts=True)):
            if i > len(self.number_of_points_by_return):
                break  # np.unique sorts values, so the remaining ones are also out of range
            self.number_of_points_by_return[i - 1] += count
self.point_count += len(points)

def __repr__(self):
return "<LasHeader({})>".format(self.version)

189 changes: 189 additions & 0 deletions pylas/lasappender.py
@@ -0,0 +1,189 @@
import io
import math
from typing import Union, Iterable

from .compression import LazBackend
from .errors import PylasError
from .evlrs import EVLRList, RawEVLRList
from .lasreader import LasReader, get_extra_dims_info_tuple
from .laswriter import UncompressedPointWriter
from .point.format import PointFormat
from .point.record import PointRecord

try:
import lazrs
except ModuleNotFoundError:
pass


class LazrsAppender:
def __init__(self, dest, header, vlrs, parallel):
self.dest = dest
self.offset_to_point_data = header.offset_to_point_data
laszip_vlr = vlrs.pop(vlrs.index("LasZipVlr"))

self.dest.seek(header.offset_to_point_data, io.SEEK_SET)
decompressor = lazrs.LasZipDecompressor(self.dest, laszip_vlr.record_data)
vlr = decompressor.vlr()
number_of_complete_chunk = int(
math.floor(header.point_count / vlr.chunk_size())
)

self.dest.seek(header.offset_to_point_data, io.SEEK_SET)
chunk_table = lazrs.read_chunk_table(self.dest)
if chunk_table is None:
            # The file does not have a chunk table, so we cannot seek
            # to the last chunk; instead we decompress the complete
            # chunks (which is slower) and rebuild the chunk table

self.chunk_table = []
start_of_chunk = self.dest.tell()
point_buf = bytearray(vlr.chunk_size() * vlr.item_size())

for _ in range(number_of_complete_chunk):
decompressor.decompress_many(point_buf)
pos = self.dest.tell()
self.chunk_table.append(pos - start_of_chunk)
start_of_chunk = pos
else:
self.chunk_table = chunk_table[:-1]
idx_first_point_of_last_chunk = number_of_complete_chunk * vlr.chunk_size()
decompressor.seek(idx_first_point_of_last_chunk)

points_of_last_chunk = bytearray(
(header.point_count % vlr.chunk_size()) * vlr.item_size()
)
decompressor.decompress_many(points_of_last_chunk)

self.dest.seek(header.offset_to_point_data, io.SEEK_SET)
        if parallel:
            self.compressor = lazrs.ParLasZipCompressor(
                self.dest, vlr
            )  # This overwrites the old offset to the chunk table
        else:
            self.compressor = lazrs.LasZipCompressor(
                self.dest, vlr
            )  # This overwrites the old offset to the chunk table
self.dest.seek(sum(self.chunk_table), io.SEEK_CUR)
self.compressor.compress_many(points_of_last_chunk)
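A quick worked check of the chunk arithmetic above (numbers are illustrative): only complete chunks can be left untouched, so the partial last chunk is decompressed and re-encoded to let appended points continue it.

import math

chunk_size = 50_000    # from the LasZip VLR (illustrative value)
point_count = 120_000  # from the header (illustrative value)

number_of_complete_chunks = int(math.floor(point_count / chunk_size))
points_in_last_chunk = point_count % chunk_size
assert number_of_complete_chunks == 2
assert points_in_last_chunk == 20_000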

def write_points(self, points):
self.compressor.compress_many(points.memoryview())

def done(self):
        # The chunk table written by the compressor is at the correct
        # position, but it is incomplete: it is missing the entries for
        # the chunks written before the ones we appended
self.compressor.done()

# So we update it
self.dest.seek(self.offset_to_point_data, io.SEEK_SET)
offset_to_chunk_table = int.from_bytes(self.dest.read(8), "little", signed=True)
self.dest.seek(-8, io.SEEK_CUR)
chunk_table = self.chunk_table + lazrs.read_chunk_table(self.dest)
self.dest.seek(offset_to_chunk_table, io.SEEK_SET)
lazrs.write_chunk_table(self.dest, chunk_table)
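For reference, a small sketch (not part of the diff) of the layout assumption behind done(): LAZ point data begins with an 8-byte signed little-endian integer holding the absolute file offset of the chunk table, which is exactly what the int.from_bytes call above reads.

import io
import struct

def read_offset_to_chunk_table(dest, offset_to_point_data):
    # First 8 bytes of the point data: absolute offset of the chunk
    # table, stored as a signed little-endian 64-bit integer.
    dest.seek(offset_to_point_data, io.SEEK_SET)
    return struct.unpack("<q", dest.read(8))[0]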


class LasAppender:
def __init__(
self,
dest,
laz_backend: Union[LazBackend, Iterable[LazBackend]] = (
LazBackend.LazrsParallel,
LazBackend.Lazrs,
),
closefd: bool = True,
) -> None:
if not dest.seekable():
            raise TypeError("Expected 'dest' to be a seekable file object")
header, vlrs = LasReader._read_header_and_vlrs(dest, seekable=True)

self.dest = dest
self.header = header
self.vlrs = vlrs
self.point_format = PointFormat(
self.header.point_format_id,
get_extra_dims_info_tuple(self.header, self.vlrs),
)

if not header.are_points_compressed:
self.points_writer = UncompressedPointWriter(self.dest)
self.dest.seek(
(self.header.point_count * self.header.point_size) + self.header.offset_to_point_data,
io.SEEK_SET
)
else:
self.points_writer = self._create_laz_backend(laz_backend)

if header.version >= "1.4" and header.number_of_evlr > 0:
assert self.dest.tell() <= self.header.start_of_first_evlr, "The position is past the start of evlrs"
            pos = self.dest.tell()
            self.dest.seek(self.header.start_of_first_evlr, io.SEEK_SET)
            self.evlrs = EVLRList.read_from(self.dest, self.header.number_of_evlr)
            self.dest.seek(pos, io.SEEK_SET)
elif header.version >= "1.4":
self.evlrs = []

self.closefd = closefd

def append_points(self, points: PointRecord) -> None:
if points.point_format != self.point_format:
raise PylasError("Point formats do not match")

self.points_writer.write_points(points)
self.header.update(points)

def close(self) -> None:
self.points_writer.done()
self._write_evlrs()
self._write_updated_header()

if self.closefd:
self.dest.close()

def _write_evlrs(self):
if self.header.version >= "1.4" and len(self.evlrs) > 0:
self.header.number_of_evlr = len(self.evlrs)
self.header.start_of_first_evlr = self.dest.tell()
raw_evlrs = RawEVLRList.from_list(self.evlrs)
raw_evlrs.write_to(self.dest)

def _write_updated_header(self):
pos = self.dest.tell()
self.dest.seek(0, io.SEEK_SET)
self.header.write_to(self.dest)
self.dest.seek(pos, io.SEEK_SET)

def _create_laz_backend(
self,
laz_backend: Union[LazBackend, Iterable[LazBackend]] = (
LazBackend.LazrsParallel,
LazBackend.Lazrs,
),
) -> LazrsAppender:
try:
laz_backend = iter(laz_backend)
except TypeError:
laz_backend = (laz_backend,)

last_error = None
for backend in laz_backend:
if backend == LazBackend.Laszip:
raise PylasError("Laszip backend does not support appending")
elif backend == LazBackend.LazrsParallel:
return LazrsAppender(self.dest, self.header, self.vlrs, parallel=True)
elif backend == LazBackend.Lazrs:
return LazrsAppender(self.dest, self.header, self.vlrs, parallel=False)
else:
if last_error is not None:
raise PylasError(f"Could not initialize a laz backend: {last_error}")
else:
raise PylasError(f"No valid laz backend selected")

def __enter__(self):
return self

def __exit__(self, exc_type, exc_val, exc_tb):
self.close()
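The backend can also be pinned explicitly instead of relying on the default fallback order (LazrsParallel, then Lazrs); a sketch reusing new_points from the first example, with existing.laz a hypothetical file:

from pylas import LazBackend
from pylas.lasappender import LasAppender

# Force the single-threaded lazrs backend.
with open("existing.laz", "rb+") as dest:
    with LasAppender(dest, laz_backend=LazBackend.Lazrs, closefd=False) as appender:
        appender.append_points(new_points)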
65 changes: 34 additions & 31 deletions pylas/lasreader.py
@@ -1,4 +1,5 @@
import abc
+import io
import logging
import os
import subprocess
@@ -13,7 +14,7 @@
from .point import record, PointFormat
from .point.dims import size_of_point_format_id
from .utils import ConveyorThread
-from .vlrs.known import LasZipVlr, ExtraBytesVlr
+from .vlrs.known import LasZipVlr
from .vlrs.vlrlist import VLRList

try:
@@ -24,16 +25,33 @@
logger = logging.getLogger(__name__)


+def get_extra_dims_info_tuple(header, vlrs) -> Optional[Tuple[Tuple[str, str], ...]]:
+    try:
+        extra_dims = vlrs.get("ExtraBytesVlr")[0].type_of_extra_dims()
+    except IndexError:
+        return None
+
+    point_size_without_extra_bytes = size_of_point_format_id(header.point_format_id)
+    if header.point_size == point_size_without_extra_bytes:
+        logger.warning(
+            "There is an ExtraBytesVlr but the header.point_size matches the "
+            "point size without extra bytes. The extra bytes vlr info will be ignored"
+        )
+        vlrs.extract("ExtraBytesVlr")
+        extra_dims = None
+    return extra_dims


class LasReader:
"""The reader class handles LAS and LAZ via one of the supported backend"""

def __init__(
-            self,
-            source: BinaryIO,
-            closefd: bool = True,
-            laz_backends: Union[
-                LazBackend, Iterable[LazBackend]
-            ] = LazBackend.detect_available(),
+        self,
+        source: BinaryIO,
+        closefd: bool = True,
+        laz_backends: Union[
+            LazBackend, Iterable[LazBackend]
+        ] = LazBackend.detect_available(),
):
self.closefd = closefd
self.laz_backends = laz_backends
@@ -54,7 +72,8 @@ def __init__(

self.points_read = 0
self.point_format = PointFormat(
-            self.header.point_format_id, extra_dims=self._get_extra_dims()
+            self.header.point_format_id,
+            extra_dims=get_extra_dims_info_tuple(self.header, self.vlrs),
)

def read_n_points(self, n: int) -> Optional[record.ScaleAwarePointRecord]:
@@ -83,7 +102,11 @@ def read(self):
points = record.PackedPointRecord.empty(self.point_format)

if self.header.version >= "1.4":
-            evlrs = self._read_evlrs(self.point_source.source)
+            if self.header.are_points_compressed and not self.point_source.source.seekable():
+                # We explicitly require a seekable stream because we have
+                # to seek past the chunk table of the LAZ file
+                raise errors.PylasError("source must be seekable to read evlrs from a LAZ file")
+            evlrs = self._read_evlrs(self.point_source.source, seekable=True)
las_data = las14.LasData(
header=self.header, vlrs=self.vlrs, points=points, evlrs=evlrs
)
@@ -137,32 +160,14 @@ def _create_laz_backend(self, source):
except errors.LazError as e:
logger.error(e)

-    def _get_extra_dims(self) -> Optional[Tuple[Tuple[str, str], ...]]:
-        try:
-            extra_dims = self.vlrs.get("ExtraBytesVlr")[0].type_of_extra_dims()
-        except IndexError:
-            return None
-
-        point_size_without_extra_bytes = size_of_point_format_id(
-            self.header.point_format_id
-        )
-        if self.header.point_size == point_size_without_extra_bytes:
-            logger.warning(
-                "There is an ExtraByteVlr but the header.point_size matches the "
-                "point size without extra bytes. The extra bytes vlr info will be ignored"
-            )
-            self.vlrs.extract("ExtraBytesVlr")
-            extra_dims = None
-        return extra_dims

@staticmethod
def _read_header_and_vlrs(source, seekable=True):
header = headers.HeaderFactory().read_from_stream(source)
vlrs = VLRList.read_from(source, num_to_read=header.number_of_vlr)
if seekable:
offset = header.offset_to_point_data - source.tell()
if offset >= 0:
-                source.read(offset)
+                source.seek(offset, io.SEEK_CUR)
else:
raise RuntimeError("Read past point data") # TODO
return header, vlrs
@@ -325,9 +330,7 @@ def __init__(self, source, laszip_vlr: LasZipVlr, parallel: bool) -> None:
source, laszip_vlr.record_data
)
else:
-            self.decompressor = lazrs.LasZipDecompressor(
-                laszip_vlr.record_data, self.source
-            )
+            self.decompressor = lazrs.LasZipDecompressor(source, laszip_vlr.record_data)

def read_n_points(self, n) -> bytes:
point_bytes = np.zeros(n * self.vlr.item_size(), np.uint8)
