From cbf2fb8a957f698508473abef5dad4a5d432c993 Mon Sep 17 00:00:00 2001 From: Jovan Veljanoski Date: Thu, 24 Nov 2022 23:19:05 +0100 Subject: [PATCH 1/2] test(hdf5): reliable export hdf5 with small chunks size --- tests/export_test.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tests/export_test.py b/tests/export_test.py index 9e8742887a..41c40f6299 100644 --- a/tests/export_test.py +++ b/tests/export_test.py @@ -311,3 +311,36 @@ def test_export_hdf5_missing_values(tmpdir): assert df2.x.tolist() == [1, None, 5, None, 10] assert df2.y.tolist() == [1.1, None, 5.5, None, 10.10] assert df2.z.tolist() == ['Yes', None, 'No', None, 'Maybe'] + +@pytest.mark.parametrize("chunk_size", [3, 33]) +def test_export_hdf5_small_chunks_case_1(tmpdir, chunk_size): + x = pa.array([1, 2, None, None, 5]) + y = pa.array([1.1, 2.2, None, None, 5.5]) + s = pa.array(['dog', 'cat', None, None, 'mouse']) + df = vaex.from_arrays(x=x, y=y, s=s) + + export_path = str(tmpdir.join('tmp.hdf5')) + df.export_hdf5(export_path, chunk_size=chunk_size) + + df_hdf5 = vaex.open(export_path) + assert df_hdf5.shape == (5, 3) + assert df_hdf5.x.tolist() == [1, 2, None, None, 5] + assert df_hdf5.y.tolist() == [1.1, 2.2, None, None, 5.5] + assert df_hdf5.s.tolist() == ['dog', 'cat', None, None, 'mouse'] + + +@pytest.mark.parametrize("chunk_size", [3, 33]) +def test_export_hdf5_small_chunks_case_2(tmpdir, chunk_size): + x = pa.array([1, 2, None, None, None, None, None, None, 5]) + y = pa.array([1.1, 2.2, None, None, None, None, None, None, 5.5]) + s = pa.array(['dog', 'cat', None, None, None, None, None, None, 'mouse']) + df = vaex.from_arrays(x=x, y=y, s=s) + + export_path = str(tmpdir.join('tmp.hdf5')) + df.export_hdf5(export_path, chunk_size=chunk_size) + + df_hdf5 = vaex.open(export_path) + assert df_hdf5.shape == (9, 3) + assert df_hdf5.x.tolist() == [1, 2, None, None, None, None, None, None, 5] + assert df_hdf5.y.tolist() == [1.1, 2.2, None, None, None, None, None, 
None, 5.5] + assert df_hdf5.s.tolist() == ['dog', 'cat', None, None, None, None, None, None, 'mouse'] From 717ea405c0fa17a59e42d62d1f4b892ef6bd4159 Mon Sep 17 00:00:00 2001 From: "Maarten A. Breddels" Date: Fri, 2 Dec 2022 13:04:42 +0100 Subject: [PATCH 2/2] fix(hdf5): round chunk_size up to a multiple of 8 and trim the null bitmap on write --- packages/vaex-hdf5/vaex/hdf5/writer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/vaex-hdf5/vaex/hdf5/writer.py b/packages/vaex-hdf5/vaex/hdf5/writer.py index c738ec46ee..d78eefc617 100644 --- a/packages/vaex-hdf5/vaex/hdf5/writer.py +++ b/packages/vaex-hdf5/vaex/hdf5/writer.py @@ -91,6 +91,7 @@ def layout(self, df, progress=None): self._layout_called = True def write(self, df, chunk_size=int(1e5), parallel=True, progress=None, column_count=1, export_threads=0): + chunk_size = ((chunk_size + 7) // 8) * 8 # round up to multiple of 8 assert self._layout_called, "call .layout() first" N = len(df) if N == 0: @@ -247,7 +248,7 @@ def write(self, values): raise ValueError("Cannot write to non-byte aligned offset") null_buffer = values.buffers()[0] if null_buffer is not None: - self.null_bitmap_array[byte_index1:byte_index2] = memoryview(null_buffer) + self.null_bitmap_array[byte_index1:byte_index2] = memoryview(null_buffer[:byte_index2-byte_index1]) else: self.null_bitmap_array[byte_index1:byte_index2] = 0xff if np.ma.isMaskedArray(self.to_array) and np.ma.isMaskedArray(values):