benchmarks: add dataset_serialize_benchmark.py #124

Merged Dec 15, 2022 (24 commits; changes shown from 19 commits)
68e9d19  benchmarks: add dataset_serialize_benchmark.py (jgehrcke, Dec 9, 2022)
f15947b  _source.py: add logging around HTTP requests (jgehrcke, Dec 12, 2022)
6992f18  serialize bm: introduce Scanner for filtering (jgehrcke, Dec 12, 2022)
277807b  fix isort (jgehrcke, Dec 12, 2022)
53968ad  isort (jgehrcke, Dec 12, 2022)
14b7e80  isort (jgehrcke, Dec 12, 2022)
8a1bed3  ci: isort: show diff (jgehrcke, Dec 12, 2022)
ff4f5e1  isort for conbench (jgehrcke, Dec 12, 2022)
35f2227  remove debug print (jgehrcke, Dec 12, 2022)
a292a81  serialize-benchmark: commit current state, cleanup follows (jgehrcke, Dec 14, 2022)
9cc975e  serialize-benchmark: clean up (jgehrcke, Dec 14, 2022)
3b313a4  add test_dataset_serialize_benchmark.py (jgehrcke, Dec 14, 2022)
c6ae972  dataset-serialize: add comment abt return value (jgehrcke, Dec 14, 2022)
c484871  _sources.py: add a bit of debug output (jgehrcke, Dec 14, 2022)
6aac940  dataset-serialize: polish comments, import (jgehrcke, Dec 14, 2022)
c0b3eb6  benchmarks.json: add dataset-serialize (jgehrcke, Dec 14, 2022)
15bbce8  dataset-serialize: friendly err on darwin (jgehrcke, Dec 14, 2022)
fd3868e  dataset-serialize: report /dev/shm usage on the fly (jgehrcke, Dec 14, 2022)
1218548  isort (jgehrcke, Dec 14, 2022)
cb7e90f  dataset-serialize: unlock 100pc case (jgehrcke, Dec 14, 2022)
fcfe9c4  dataset-serialize: simpler size cases (jgehrcke, Dec 15, 2022)
71781d8  dataset-serialize: BENCHMARK_OUTPUT_DIR env var (jgehrcke, Dec 15, 2022)
f0806f0  _sources: remove log line again (jgehrcke, Dec 15, 2022)
4ce89d5  tests: fix, adjust to dataset-serialize changes (jgehrcke, Dec 15, 2022)
3 changes: 3 additions & 0 deletions .github/workflows/actions.yml
@@ -34,7 +34,10 @@ jobs:
run: |
flake8
- name: Lint (isort)
# Note(JP): if --check fails then it's not always obvious why.
# Show diff beforehand.
run: |
isort --diff .
Contributor:

Thank you! I've had a good amount of issues with isort; I wonder if we should pin a version to hopefully get more consistency

Contributor Author:

In this case the difference between environments was after all how I chose to integrate conbench (not from PyPI, but from my local checkout).

isort --check .
- name: Install libcurl (for R arrow)
run: |
6 changes: 6 additions & 0 deletions benchmarks.json
@@ -67,6 +67,12 @@
"language": "Python"
}
},
{
"command": "dataset-serialize ALL --iterations=3 --all=true --drop-caches=true",
"flags": {
"language": "Python"
}
},
{
"command": "file-read ALL --iterations=3 --all=true --drop-caches=true",
"flags": {
19 changes: 15 additions & 4 deletions benchmarks/_sources.py
@@ -1,4 +1,5 @@
import functools
import logging
import os
import pathlib
from enum import Enum
@@ -15,6 +16,9 @@
temp_dir = os.path.join(data_dir, "temp")


log = logging.getLogger(__name__)


def _local(name):
"""Sources for unit testing, committed to benchmarks/data."""
return os.path.join(local_data_dir, name)
@@ -435,8 +439,10 @@ def table(self):
return self._table

def _get_object_url(self, idx=0):
log.info("_get_object_url for idx %s", idx)
if self.paths:
s3_url = pathlib.Path(self.paths[idx])
log.info("s3_url: %s", s3_url)
return (
"https://"
+ s3_url.parts[0]
Expand All @@ -452,11 +458,16 @@ def download_source_if_not_exists(self):
for idx, p in enumerate(self.source_paths):
path = pathlib.Path(p)
if not path.exists():
log.info("path does not exist: %s", path)
path.parent.mkdir(parents=True, exist_ok=True)
source = self.store.get("source")
if not source:
source = self._get_object_url(idx)
r = requests.get(source)

url = self.store.get("source")
if not url:
url = self._get_object_url(idx)

log.info("HTTP GET %s", url)
r = requests.get(url)
log.info("write response to disk")
open(path, "wb").write(r.content)

def _csv_write(self, table, path, compression):
272 changes: 272 additions & 0 deletions benchmarks/dataset_serialize_benchmark.py
@@ -0,0 +1,272 @@
import itertools
import logging
import os
import shutil
import subprocess
import sys
import time
import uuid

import conbench.runner
import pyarrow
import pyarrow.dataset as ds

from benchmarks import _benchmark

log = logging.getLogger(__name__)


# All benchmark scenarios will write below OUTPUT_DIR_PREFIX (a unique
# directory under /dev/shm). That directory tree is removed upon completion
# (though not necessarily in case of error).
OUTPUT_DIR_PREFIX = os.path.join("/dev/shm/", "bench-" + str(uuid.uuid4())[:8])


@conbench.runner.register_benchmark
class DatasetSerializeBenchmark(_benchmark.Benchmark):
"""
This benchmark is supposed to measure the time it takes to write data from
memory (from a pyarrow Table) to a tmpfs file system, given a specific
serialization format (parquet, arrow, ...).

To make this benchmark agnostic to disk read performance on the input side
of things, the data is read fully into memory before starting (and timing)
the benchmark function. That is (believed to be) achieved with:

data = source_dataset.to_table(filter=somefilter, ...)

Comment:

If we are performing this outside of the measurements, then the filter is also not being measured, right? I'm all for keeping the code, though, because I think it would make sense to eventually expand this benchmark into measuring an end to end read - filter - write workload.

Contributor Author:

If we are performing this outside of the measurements, then the filter is also not being measured, right?

Right!

The following benchmark times read-from-disk-backed-fs-and-then-filter:

return lambda: dataset.to_table(

And this one does, too:

return lambda: dataset.to_table(filter=(vendor == "DDS") & (count > 3))

Although I would think that in both cases the filtering does not dominate the time, but instead it's the network-attached disk (see below).

For focusing a benchmark on the filter performance, the data should be in memory already, using e.g. tmpfs (/dev/shm), like I am proposing here.

I think it would make sense to eventually expand this benchmark into measuring an end to end read - filter - write workload.

Yeah, interesting thought. Maybe we want this kind of test/benchmark in the future. As described in #123, I in fact started with exactly that here, and then wondered what the use would be. I called this "RDFSW":

reading -> deserializing -> filtering -> serializing -> writing

and I found that the signal on the final two stages was weak or non-existent, because of the read-from-network-attached-disk on the left-hand side of the flow. This part is both slow and volatile compared to the other steps.

If we do this, we should read from tmpfs (which limits dataset size, of course).

That's a great discussion for subsequent work!
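
For reference, a minimal sketch of what such a future end-to-end ("RDFSW") benchmark could look like, assuming the source data has already been copied onto tmpfs; this is not part of this PR, and the paths and filter threshold are placeholders borrowed from the tables below:

```python
import time

import pyarrow.dataset as ds

# Hedged sketch, not part of this PR: time the full
# read -> deserialize -> filter -> serialize -> write chain, with the source
# living on tmpfs so the slow, volatile network-attached disk is out of the
# picture. "/dev/shm/source-copy" and the filter threshold are placeholders.
src = ds.dataset("/dev/shm/source-copy", format="parquet")

t0 = time.monotonic()
table = src.to_table(filter=ds.field("pickup_longitude") < -74.002055)
ds.write_dataset(data=table, base_dir="/dev/shm/rdfsw-out", format="parquet")
print(f"RDFSW round trip took {time.monotonic() - t0:.3f} s")
```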


After data of interest has been read into memory, the following call is
used for both serialization and writing-to-filesystem in one go:

pyarrow.dataset.write_dataset(format=someformat, ...)

That operation is timed (and the duration is the major output of this
benchmark).

The data is written to `/dev/shm` (available on all Linux systems). This is
a file system backed by RAM (tmpfs). The assumption is that writing to
tmpfs is fast (so fast that benchmark duration is significantly affected by
the serialization itself), and stable (so that its performance is ~constant
across runs on the same machine).

This benchmark does not resolve how much time goes into the CPU work for
serialization vs. the system calls for writing to tmpfs (that would be a
different question to answer, an interesting one, that is maybe more of a
task for profiling).

There are two dimensions that are varied:

- serialization format
- amount of the data being written, as set by a filter on the input

Comment:

Ah I see, the filter is the means by which we are varying the data size

Contributor Author:

Right. It's a little indirect, and we don't get to see how big the data actually was. I think this can/should be improved in the future.

Contributor Author:

update: with the recent commits I changed the approach


A note about /dev/shm: it's of great value because

- unprivileged users can write to it
- the `base_dir` arg to pyarrow.dataset.write_dataset() requires a path to
a directory. That is, one cannot inject a memory-backed Python file
object (a strategy that's elsewhere often used to simulate writing to an
actual file)
- it is not available on macOS, which is why this benchmark is skipped there

"""

name = "dataset-serialize"

arguments = ["source"]

sources = [
"nyctaxi_multi_parquet_s3",
"nyctaxi_multi_ipc_s3",
# "chi_traffic_2020_Q1",
Contributor Author (@jgehrcke, Dec 14, 2022):

With the chi_... benchmark I do get:

[221214-16:26:00.151] [370389] [root] ERROR: {"timestamp": "2022-12-14T15:26:00.151675+00:00", "tags": {"dataset": "chi_traffic_2020_Q1", "cpu_count": null, "selectivity": "10pc", "format": "csv", "name": "dataset-serialize"}, "info": {"arrow_version": "10.0.1", "arrow_compiler_id": "GNU", "arrow_compiler_version": "10.2.1", "benchmark_language_version": "Python 3.10.8"}, "context": {"arrow_compiler_flags": " -fdiagnostics-color=always -O2 -DNDEBUG -ftree-vectorize", "benchmark_language": "Python"}, "error": "Unsupported Type:struct<sec: double, min: int32, hour: int32, mday: int32, mon: int32, year: int32, wday: int32, yday: int32, isdst: int32, zone: string, gmtoff: int32>"}
Traceback (most recent call last):
  File "/home/jp/dev/voltrondata-labs-benchmarks/benchmarks/_benchmark.py", line 86, in benchmark
    benchmark, output = self.conbench.benchmark(
  File "/home/jp/dev/voltrondata-labs-benchmarks/conbench/conbench/runner.py", line 188, in benchmark
    raise e
  File "/home/jp/dev/voltrondata-labs-benchmarks/conbench/conbench/runner.py", line 160, in benchmark
    data, output = self._get_timing(f, iterations, timing_options)
  File "/home/jp/dev/voltrondata-labs-benchmarks/conbench/conbench/runner.py", line 363, in _get_timing
    output = f()
  File "/home/jp/dev/voltrondata-labs-benchmarks/benchmarks/dataset_serialize_benchmark.py", line 242, in benchfunc
  File "/home/jp/.pyenv/versions/3108-vd-benchmarks/lib/python3.10/site-packages/pyarrow/dataset.py", line 988, in write_dataset
    _filesystemdataset_write(
  File "pyarrow/_dataset.pyx", line 2859, in pyarrow._dataset._filesystemdataset_write
  File "pyarrow/error.pxi", line 100, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: Unsupported Type:struct<sec: double, min: int32, hour: int32, mday: int32, mon: int32, year: int32, wday: int32, yday: int32, isdst: int32, zone: string, gmtoff: int32>

for the csv case.

Contributor Author:

@jonkeane should we submit this as an issue somewhere?

Contributor:

Looking at that error, there's an issue converting a struct to a text representation for the CSV output. There might be an issue in apache/arrow for that already (though I suspect that faithful writing of structs to CSVs is likely a feature that apache/arrow will punt on, or possibly already did punt on).
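
For illustration, one possible workaround (a sketch only, not used in this PR) is to flatten struct columns into top-level columns before handing the table to the CSV writer, via pyarrow.Table.flatten():

```python
import pyarrow as pa

# Sketch of a possible workaround, not used in this PR: Table.flatten()
# turns each struct field into its own top-level column (names like
# "ts.hour"), which CSV can represent. Loop because flatten() removes only
# one level of nesting at a time.
def flatten_structs(table: pa.Table) -> pa.Table:
    while any(pa.types.is_struct(field.type) for field in table.schema):
        table = table.flatten()
    return table
```

Whether the resulting dotted column names are acceptable for a benchmark dataset is a separate question; this only sidesteps the ArrowInvalid error.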

]

sources_test = [
"nyctaxi_multi_parquet_s3_sample",
"nyctaxi_multi_ipc_s3_sample",
"chi_traffic_sample",
]

_params = {
Contributor:

There are a number of other parameters here that are likely worth exploring here.

  1. A big issue to explore here is partitioning, encompassing which categoricals are used for hive directory partitioning, how big files are allowed to be within those directories, and how big row groups are within files. I'm not sure to what extent it's worth torturing the system with very many files (or too few), but at least trending toward either end of the plausible spectrum is useful for representing more real-world workloads.
  2. Compression is always important for file formats. The file writing benchmarks cover a lot, but we probably want to make sure we're covering some common variants here like snappy for parquet (a default, I think?) and gzip for csv (not a default, I don't think).

Contributor Author:

Great input!

I think writing-with-compression should be its own rather laser-focused benchmark, very similar to what's in here so far (certainly with writing to RAM!). It's a great candidate for extending this very benchmark here later by one more dimension.

Contributor Author:

About partitioning: maybe this makes most sense when writing many files of considerable size. I don't know yet how compatible these two ideas are: i) write to a potentially small tmpfs and ii) exercise quite a bit of partitioning machinery. They probably are compatible. But yeah, let's think about that in an independent effort.
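
To make the two proposed dimensions concrete, here is a hedged sketch (not part of this PR) of how write_dataset() can be given per-format compression options and hive-style directory partitioning; the tiny table, output path, and partition column are made up:

```python
import pyarrow as pa
import pyarrow.dataset as ds

# Hedged sketch, not part of this PR. The table contents, output path and
# partition column are placeholders.
table = pa.table({"vendor_id": ["A", "A", "B"], "count": [1, 2, 3]})

ds.write_dataset(
    data=table,
    base_dir="/dev/shm/partition-compression-demo",
    format="parquet",
    # snappy is the common parquet default; gzip for csv would be another variant.
    file_options=ds.ParquetFileFormat().make_write_options(compression="snappy"),
    # Hive-style directory partitioning on a low-cardinality column.
    partitioning=ds.partitioning(
        pa.schema([("vendor_id", pa.string())]), flavor="hive"
    ),
)
```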

"selectivity": ("1pc", "10pc"),

Comment:

In the end we'd also like to measure 100%, is that not included to keep the runtime in check?

Contributor Author (@jgehrcke, Dec 14, 2022):

I am conservative here to keep /dev/shm usage in check.

On my machine I have a 16 GB /dev/shm and during dev I managed to fill this and almost crashed my desktop environment.

I will play a little more with that and maybe update the PR again to add a variant that writes more data, maybe not 100 %, but more than 10 % :).

Contributor Author (@jgehrcke, Dec 14, 2022):

Added commits. For nyctaxi_multi_ipc_s3 and the 10 % / csv scenario the output is already ~ 5 GB:

[221214-17:11:21.205] [375153] [benchmarks.dataset_serialize_benchmark] INFO: case ('10pc', 'csv'): create directory
[221214-17:11:21.205] [375153] [benchmarks.dataset_serialize_benchmark] INFO: directory created, path: /dev/shm/bench-02a3a0e3/10pc-csv-de09c188-69d7-41b4-9c0b-fd00654f06ad
[221214-17:11:21.877] [375153] [benchmarks.dataset_serialize_benchmark] INFO: read source dataset into memory in 0.6720 s
[221214-17:11:50.099] [375153] [benchmarks.dataset_serialize_benchmark] INFO: stdout of ['du', '-sh', '/dev/shm/bench-02a3a0e3/10pc-csv-de09c188-69d7-41b4-9c0b-fd00654f06ad']: 4.6G

I think we should not grow beyond that.

Contributor Author:

Update: I have adopted the ‘overwrite_or_ignore’ technique and enabled the 100 % case. The maximum usage of /dev/shm now is 7.6 GB, and I think that's OK (if the CI runners also provide that much space).
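
Given that concern, a small guard could be checked before running; a sketch (not in this PR), with the ~8 GiB threshold being an assumption:

```python
import shutil
import sys

# Sketch, not in this PR: refuse to run when /dev/shm looks too small for the
# observed peak usage (~7.6 GB above). The 8 GiB threshold is an assumption.
REQUIRED_FREE_BYTES = 8 * 1024**3

free = shutil.disk_usage("/dev/shm").free
if free < REQUIRED_FREE_BYTES:
    sys.exit(
        f"/dev/shm has only {free / 1024**3:.1f} GiB free, "
        f"need about {REQUIRED_FREE_BYTES / 1024**3:.0f} GiB; aborting."
    )
```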

Contributor:

Percent (I'm assuming pc == %?) seems like an awkward way to do this for lots of sources; wouldn't it be simpler to do a number of rows? Although what you sample needs to be deterministic as well, one way or another, so I guess there's no magic way to generalize.

"format": (
"parquet",
"arrow",
"ipc",
"feather",
Contributor:

arrow == feather, at least for feather v2, and I don't think we care much about v1 anymore. There's been some effort now to just start calling everything "arrow files" so we can probably just drop feather here.

Contributor:

Wait is ipc also the same thing? I don't know that much about it but it may be...

Contributor Author:

Great input.

arrow == feather, at least for feather v2,

Interesting! Hmm, the timings that I saw suggest that this equality isn't quite right. It appears that feather and ipc are highly comparable, e.g.:

dataset: nyctaxi_multi_parquet_s3, format: arrow, selectivity: 100pc
    --> min: 1.38 s     mean±SE: (1.48 ± 0.05) s
dataset: nyctaxi_multi_parquet_s3, format: ipc, selectivity: 100pc
    --> min: 1.39 s     mean±SE: (1.55 ± 0.09) s
dataset: nyctaxi_multi_parquet_s3, format: feather, selectivity: 100pc
    --> min: 1.44 s     mean±SE: (1.55 ± 0.06) s
dataset: nyctaxi_multi_ipc_s3, format: arrow, selectivity: 100pc
    --> min: 1.64 s     mean±SE: (1.77 ± 0.08) s
dataset: nyctaxi_multi_ipc_s3, format: ipc, selectivity: 100pc
    --> min: 1.75 s     mean±SE: (2.4 ± 0.2) s
dataset: nyctaxi_multi_ipc_s3, format: feather, selectivity: 100pc
    --> min: 2.34 s     mean±SE: (2.7 ± 0.1) s

https://arrow.apache.org/docs/python/feather.html

Version 2 (V2), the default version, which is exactly represented as the Arrow IPC file format on disk. V2 files support storing all Arrow data types as well as compression with LZ4 or ZSTD. V2 was first made available in Apache Arrow 0.17.0.

So, I will remove either feather or ipc.

The docs for write_dataset() suggest that all three are the same/comparable:

The format in which to write the dataset. Currently supported: “parquet”, “ipc”/”arrow”/”feather”, and “csv”.

However, there probably is a difference (as indicated by measurement results). I think it might have to do with compression. For feather, this is documented:

LZ4 is used by default if it is available (which it should be if you obtained pyarrow through a normal package manager):

Maybe feather is lz4(arrow).

I think in this case it makes sense to leave in both, kind of covering the 'default' settings.
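
One way to check the compression hypothesis is to write the same table once per format string and compare on-disk sizes. A sketch (not part of this PR; the synthetic table and temp directories are made up):

```python
import os
import tempfile

import pyarrow as pa
import pyarrow.dataset as ds

# Sketch, not part of this PR: write the same synthetic table with each of
# the three IPC-flavored format strings and compare on-disk sizes. If
# "feather" applies LZ4 by default while "arrow"/"ipc" do not, its output
# should be noticeably smaller.
table = pa.table(
    {"x": list(range(500_000)), "y": ["some repetitive text"] * 500_000}
)

def dir_size_bytes(path: str) -> int:
    return sum(
        os.path.getsize(os.path.join(root, name))
        for root, _dirs, files in os.walk(path)
        for name in files
    )

for fmt in ("arrow", "ipc", "feather"):
    outdir = tempfile.mkdtemp(prefix=f"fmt-{fmt}-")
    ds.write_dataset(data=table, base_dir=outdir, format=fmt)
    print(f"{fmt:8s} {dir_size_bytes(outdir) / 1024**2:.1f} MiB")
```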

"csv",
),
}

valid_cases = [tuple(_params.keys())] + list(
itertools.product(*[v for v in _params.values()])
)

filters = {
"nyctaxi_multi_parquet_s3": {
"1pc": ds.field("pickup_longitude") < -74.013451, # 561384
"10pc": ds.field("pickup_longitude") < -74.002055, # 5615432
"100pc": None, # 56154689
},
Contributor:

If we need to recalculate any of these for some reason (say we decide we're interested in 20% or whatever at some point), how are these values calculated? Can we store that somewhere?

Contributor Author (@jgehrcke, Dec 15, 2022):

I started building this module by copying everything from dataset_selectivity_benchmark.py, and then changing those things that I cared about most. That's where this approach came from.

If we need to recalculate any of these for some reason (say we decide we're interested in 20% or whatever at some point), how are these values calculated?

I do not know, but I suppose the original author applied some manual approximation, tweaking the threshold until reaching roundabout the desired number of rows.

.... wouldn't it be simpler to do a number of rows? Although what you sample needs to be deterministic as well, one way or another, so I guess there's no magic way to generalize.

I thought about simply taking a number of rows either from head or tail or mid part, but then I realized that the filter gives some set of rows that might be a more interesting data selection, obtained deterministically (as you say, i.e. just doing a random subset of fixed size isn't going to be similar).

I fully agree that the percent -> number of rows translation is very indirect. In the process I also learned that filtering shouldn't even be part of this benchmark, so this indirection is by now just kind of misleading.

I will see if it's super easy to change this into using either head or tail with desired row counts, w/o affecting the timing of the benchmark much.
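
As one possible answer to "how are these values calculated", here is a sketch (not in this PR; the helper name is hypothetical) that derives the threshold from a column quantile instead of hard-coding it:

```python
import pyarrow.compute as pc
import pyarrow.dataset as ds

# Sketch, not part of this PR; the helper name is hypothetical. The filters
# above use "<", so the lower `fraction`-quantile of the column is the
# cut-off below which roughly that share of rows falls.
def threshold_for_fraction(dataset: ds.Dataset, column: str, fraction: float) -> float:
    values = dataset.to_table(columns=[column])[column]
    return pc.quantile(values, q=fraction)[0].as_py()

# For example, threshold_for_fraction(source_ds, "pickup_longitude", 0.10)
# should land near the hard-coded -74.002055 for nyctaxi_multi_parquet_s3.
```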

"nyctaxi_multi_ipc_s3": {
"1pc": ds.field("pickup_longitude") < -74.014053, # 596165
"10pc": ds.field("pickup_longitude") < -74.002708, # 5962204
"100pc": None, # 59616487
},
"chi_traffic_2020_Q1": {
"1pc": ds.field("END_LONGITUDE") < -87.807262, # 124530
"10pc": ds.field("END_LONGITUDE") < -87.7624, # 1307565
"100pc": None, # 13038291
},
**dict.fromkeys(
["nyctaxi_multi_parquet_s3_sample", "nyctaxi_multi_ipc_s3_sample"],
{
"1pc": ds.field("pickup_longitude") < -74.0124, # 20
"10pc": ds.field("pickup_longitude") < -74.00172, # 200
"100pc": None, # 2000
},
),
"chi_traffic_sample": {
"1pc": ds.field("END_LONGITUDE") < -87.80726, # 10
"10pc": ds.field("END_LONGITUDE") < -87.76148, # 100
"100pc": None, # 1000
},
}

_case_tmpdir_mapping = {}

def _create_tmpdir_in_ramdisk(self, case: tuple):
# Build simple prefix string for specific test case to facilitate
# correlating directory names to test cases.
pfx = "-".join(c.lower()[:9] for c in case)
dirpath = os.path.join(OUTPUT_DIR_PREFIX, pfx + "-" + str(uuid.uuid4()))

self._case_tmpdir_mapping[tuple(case)] = dirpath

os.makedirs(dirpath, exist_ok=False)
return dirpath

def _get_dataset_for_source(self, source) -> ds.Dataset:
"""Helper to construct a Dataset object."""

return pyarrow.dataset.dataset(
source.source_paths,
schema=pyarrow.dataset.dataset(
source.source_paths[0], format=source.format_str
).schema,
format=source.format_str,
)

def _report_dirsize_and_wipe(self, dirpath: str):
"""
This module already has a dependency on Linux so we can just as well
spawn `du` for correct recursive directory size reporting"""

ducmd = ["du", "-sh", dirpath]
p = subprocess.run(ducmd, capture_output=True)
log.info("stdout of %s: %s", ducmd, p.stdout.decode("utf-8").split()[0])
if p.returncode != 0:
log.info("stderr of %s: %s", ducmd, p.stderr)
log.info("removing directory: %s", dirpath)
shutil.rmtree(dirpath)

def run(self, source, case=None, **kwargs):

if not os.path.exists("/dev/shm"):
sys.exit("/dev/shm not found but required (not available on Darwin). Exit.")

cases = self.get_cases(case, kwargs)

for source in self.get_sources(source):

log.info("source %s: download, if required", source.name)
source.download_source_if_not_exists()
tags = self.get_tags(kwargs, source)

t0 = time.monotonic()
source_ds = self._get_dataset_for_source(source)
log.info(
"constructed Dataset object for source in %.4f s", time.monotonic() - t0
)

for case in cases:

log.info("case %s: create directory", case)
dirpath = self._create_tmpdir_in_ramdisk(case)
log.info("directory created, path: %s", dirpath)

yield self.benchmark(
f=self._get_benchmark_function(
case, source.name, source_ds, dirpath
),
extra_tags=tags,
options=kwargs,
case=case,
)

# Free up memory in the RAM disk (tmpfs), assuming that we're
# otherwise getting close to filling it (depending on the
# machine this is executed on, a single test might easily
# occupy 10 % or more of this tmpfs). Note that what
# accumulated in `dirpath` is the result of potentially
# multiple iterations.
self._report_dirsize_and_wipe(dirpath)

# Finally, remove the outermost directory. It should have no contents by now, but
# if an individual benchmark iteration was Ctrl+C'd then this here
# might still do useful cleanup.
self._report_dirsize_and_wipe(OUTPUT_DIR_PREFIX)

def _get_benchmark_function(
self, case, source_name: str, source_ds: ds.Dataset, dirpath: str
):

(selectivity, serialization_format) = case

# Option A: read-from-disk -> deserialize -> filter -> into memory
# before timing serialize -> write-to-tmpfs
t0 = time.monotonic()
data = source_ds.to_table(filter=self.filters[source_name][selectivity])
log.info("read source dataset into memory in %.4f s", time.monotonic() - t0)

# Option B (thrown away, but kept for posterity): use a Scanner()
# object to transparently filter the source dataset upon consumption,
# in which case what's timed is read-from-disk -> deserialize -> filter
# -> serialize -> write-to-tmpfs
#
# Note(JP): I have confirmed that for the data used in this benchmark
# this option is dominated by read-from-disk to the extent that no
# useful signal is generated for the write phase, at least for some
# serialization formats.
#
# data = pyarrow.dataset.Scanner.from_dataset(source_ds,
# filter=self.filters[source_name][selectivity])
Contributor Author:

I kept this because it's somewhat non-obvious from pyarrow docs that this Scanner()-based technique really works (that the Scanner instance can be provided as data argument to write_dataset()). I tried it out and it looks good, it's certainly a lazy evaluation. It's a super cool technique that maybe we should blog about (well, maybe my enthusiasm is too big because I am new to this ecosystem :)).

I think I'd like to add a benchmark that does the complete round trip in a child process and then the metric we would care about is the maximum memory usage of that child process, confirming that this really is operating in a stream-like fashion: that at any given time the process only holds a fraction of the complete data set in memory.
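
A sketch of that idea (not part of this PR; the source path is a placeholder): run the Scanner-based round trip in a child process and read the child's peak RSS afterwards:

```python
import resource
import subprocess
import sys
import textwrap

# Sketch, not part of this PR. Run a full read -> (optionally filter) -> write
# round trip through a Scanner in a child process; a low peak RSS would
# support the stream-like behavior. The source path is a placeholder.
child_code = textwrap.dedent(
    """
    import pyarrow.dataset as ds
    src = ds.dataset("/path/to/source", format="parquet")
    scanner = ds.Scanner.from_dataset(src)
    ds.write_dataset(data=scanner, base_dir="/dev/shm/rss-demo", format="parquet")
    """
)

subprocess.run([sys.executable, "-c", child_code], check=True)

# On Linux, ru_maxrss is in kilobytes; RUSAGE_CHILDREN covers waited-for
# children of this process (here: just the one started above).
peak_kib = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
print(f"child peak RSS: {peak_kib / 1024:.0f} MiB")
```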


def benchfunc():
# This is a hack to make each iteration work in a separate
# directory (otherwise some write operations would error out saying
# that the target directory is not empty). With `benchrun` it will

Comment:

If you don't like this, there is an existing_data_behavior parameter in write_dataset you could use

Contributor Author (@jgehrcke, Dec 14, 2022):

Oh, cool. Thanks for pointing that out! Documented here:

https://arrow.apache.org/docs/python/generated/pyarrow.dataset.write_dataset.html

existing_data_behavior
‘error’ | ‘overwrite_or_ignore’ | ‘delete_matching’

The current approach creates a fresh directory for each iteration and should therefore be 'fastest'. Deletion might take valuable time (which would be timed by the benchmark wrapper), and is therefore not a good option.

It might be that ‘overwrite_or_ignore’ is actually the smartest if the second+ write consumes as much time as the first one. This would be the best option to limit space consumption across iterations.
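
For completeness, a sketch of the ‘overwrite_or_ignore’ variant discussed here (not what the PR does as-is; the wrapper function is hypothetical):

```python
import pyarrow.dataset as ds

# Sketch, not what this PR does: keep one base_dir per case and let every
# iteration write into it. With the default basename_template
# ("part-{i}." plus the format's extension), iteration N overwrites the part
# files of iteration N-1, which caps tmpfs usage at roughly one iteration's
# output.
def write_overwriting(data, serialization_format: str, dirpath: str) -> None:
    ds.write_dataset(
        data=data,
        base_dir=dirpath,
        format=serialization_format,
        existing_data_behavior="overwrite_or_ignore",
    )
```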

# be easier to cleanly hook into doing resource management before
# and after an iteration w/o affecting the timing measurement.
# Assume that creating a directory does not significantly add to
# the duration of the actual payload function.
dp = os.path.join(dirpath, str(uuid.uuid4())[:8])
os.makedirs(dp)

# When dimensioning of benchmark parameters and execution
# environment are not adjusted to each other, tmpfs quickly gets
# full. In that case writing might fail with
#
# File "pyarrow/_dataset.pyx", line 2859, in
# pyarrow._dataset._filesystemdataset_write File
# "pyarrow/error.pxi", line 113, in pyarrow.lib.check_status
# OSError: [Errno 28] Error writing bytes to file. Detail: [errno
# 28] No space left on device

pyarrow.dataset.write_dataset(
data=data, format=serialization_format, base_dir=dp
)

# The benchmark function returns `None` for now. If we need
# deeper inspection into the result maybe iterate on that.

return benchfunc