Skip to content

Commit

Permalink
🐛 hdf5 export fails for concat df with missing columns (#1493)
Browse files Browse the repository at this point in the history
* test(core): hdf5 export fails for concat df with missing columns

* fix

Co-authored-by: Maarten A. Breddels <maartenbreddels@gmail.com>
  • Loading branch information
JovanVeljanoski and maartenbreddels committed Apr 11, 2022
1 parent 0b3a094 commit 95f43e1
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 0 deletions.
3 changes: 3 additions & 0 deletions packages/vaex-core/vaex/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -753,6 +753,9 @@ def _decode(cls, encoding, spec):
return ds

def is_masked(self, column):
    """Report whether ``column`` may contain masked (missing) values.

    The column counts as masked when at least one of ``self.datasets``
    does not contain it at all, or when any member dataset itself
    reports the column as masked.
    """
    parts = self.datasets
    # A column absent from any member dataset is treated as masked,
    # without consulting the per-dataset is_masked at all.
    if not all(column in part for part in parts):
        return True
    for part in parts:
        if part.is_masked(column):
            return True
    return False

def shape(self, column):
Expand Down
2 changes: 2 additions & 0 deletions tests/concat_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,8 @@ def test_concat_unaligned_schema(arrow):
df = df1.concat(df2)
assert df.x.tolist() == [1, 2, None, None]
assert df.y.tolist() == [None, None, 'd', 'e']
assert df.is_masked('x')
assert df.is_masked('y')
# always 'upcast' to Arrow arrays
# # rationale: Arrow will use less memory, numpy has no efficient way to represent all missing data
assert df.x.data_type() == pa.float32()
Expand Down
14 changes: 14 additions & 0 deletions tests/export_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,20 @@ def test_export_open_hdf5(ds_local):
ds_opened = vaex.open(filename)
assert list(ds) == list(ds_opened)

def test_export_concat_missing_cols_hdf5(tmpdir):
    """Exporting a concatenated frame with unaligned columns to HDF5 round-trips.

    ``df2`` lacks the ``s`` and ``y`` columns that ``df1`` has; after
    concatenation, export and re-open, the rows contributed by ``df2``
    must read back as ``None`` in those columns.
    """
    df1 = vaex.from_arrays(x=[1, 2, 3], s=['x1', 'x2', 'x3'], y=[10, 20, 30])
    df2 = vaex.from_arrays(x=[4, 5, 6])

    df = vaex.concat([df1, df2])

    # Write inside the pytest-managed temporary directory instead of using
    # the deprecated, race-prone tempfile.mktemp(); this also keeps the
    # exported file out of the system temp directory.
    filename = str(tmpdir.join('concat_missing_cols.hdf5'))
    df.export_hdf5(filename)

    df_opened = vaex.open(filename)
    assert df_opened.x.tolist() == [1, 2, 3, 4, 5, 6]
    assert df_opened.y.tolist() == [10, 20, 30, None, None, None]
    assert df_opened.s.tolist() == ['x1', 'x2', 'x3', None, None, None]

def test_export_open_csv(ds_local, tmpdir):
df = ds_local
path = str(tmpdir.join('test.csv'))
Expand Down

0 comments on commit 95f43e1

Please sign in to comment.