From 2986d104e45554731d8af2d5ffed21d9a8144afa Mon Sep 17 00:00:00 2001 From: "Maarten A. Breddels" Date: Thu, 17 Jun 2021 10:56:10 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20fingerprints=20using=20df.copy?= =?UTF-8?q?=20and=20projecting=20were=20not=20deterministic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packages/vaex-core/vaex/dataframe.py | 3 +++ packages/vaex-core/vaex/dataset.py | 11 +++++++--- tests/dataset_test.py | 5 +++++ tests/fingerprint_test.py | 33 ++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 3 deletions(-) diff --git a/packages/vaex-core/vaex/dataframe.py b/packages/vaex-core/vaex/dataframe.py index 6a828758f0..52c0fe73fa 100644 --- a/packages/vaex-core/vaex/dataframe.py +++ b/packages/vaex-core/vaex/dataframe.py @@ -5631,6 +5631,9 @@ def track(name): # first create the DataFrame with real data (dataset) dataset_columns = {k for k in required if k in self.dataset} + # we want a deterministic order for fingerprinting (a set has arbitrary iteration order) + dataset_columns = list(dataset_columns) + dataset_columns.sort() dataset = self.dataset.project(*dataset_columns) df = vaex.from_dataset(dataset) diff --git a/packages/vaex-core/vaex/dataset.py b/packages/vaex-core/vaex/dataset.py index e44a9bb359..9f4cf55c9f 100644 --- a/packages/vaex-core/vaex/dataset.py +++ b/packages/vaex-core/vaex/dataset.py @@ -403,6 +403,9 @@ def row_count(self): def project(self, *names): all = set(self) drop = all - set(names) + # we want a deterministic order for fingerprints (a set has arbitrary iteration order) + drop = list(drop) + drop.sort() return self.dropped(*list(drop)) def concat(self, *others, resolver='flexible'): @@ -470,7 +473,7 @@ def __hash__(self): missing = keys ^ keys_hashed if missing: raise ValueError(f'Trying to hash a dataset with unhashed columns: {missing} (tip: use dataset.hashed())') - return hash(self._ids) + return hash(tuple(self._ids.items())) def _default_lazy_chunk_iterator(self, array_map, columns, chunk_size, reverse=False): chunk_size 
= chunk_size or 1024**2 @@ -1310,8 +1313,10 @@ def id(self): @property def _fingerprint(self): - hash = str(self.__hash__()) - return f'dataset-{self.snake_name}-hashed-{hash}' + self.__hash__() # invoke just to check we don't have missing hashes + # but Python's hash functions are not deterministic (across processes) + fp = vaex.cache.fingerprint(tuple(self._ids.items())) + return f'dataset-{self.snake_name}-hashed-{fp}' def leafs(self) -> List[Dataset]: return [self] diff --git a/tests/dataset_test.py b/tests/dataset_test.py index 6fe8577aef..856c529554 100644 --- a/tests/dataset_test.py +++ b/tests/dataset_test.py @@ -262,6 +262,11 @@ def test_drop(rebuild_dataset): assert ds1.hashed() == ds2.merged(ds3).hashed() assert rebuild_dataset(ds1).hashed() == rebuild_dataset(ds2.merged(ds3)).hashed() + ds1b = dataset.DatasetArrays(x=x, y=y) + assert ds1.fingerprint == ds1b.fingerprint + ds2b = ds1.dropped('x') + assert ds2.fingerprint == ds2b.fingerprint + def test_concat(rebuild_dataset): x = np.arange(10) diff --git a/tests/fingerprint_test.py b/tests/fingerprint_test.py index 2c14a9aa20..426da29689 100644 --- a/tests/fingerprint_test.py +++ b/tests/fingerprint_test.py @@ -77,3 +77,36 @@ def test_column_indexed(): x3 = vaex.column.ColumnIndexed(x**2, i) assert x1.fingerprint() == x2.fingerprint() assert x1.fingerprint() != x3.fingerprint() + + +# these fingerprints may change over time, they may change as we change versions +# but they should at least not change per Python version, OS or after restarts + +def test_dataset_arrays(): + x = np.arange(10, dtype='i4') + y = x**2 + ds = vaex.dataset.DatasetArrays(x=x, y=y, z=x+y) + assert dict(ds._ids) == { + 'x': '031385dd4f0d2ba1aba2aeab0ad7c99814c90c11e96e5bc7cc8bd72112556dff', + 'y': '4d48c88e587db8f3855eed9f5d5f51eea769451b7371ecf7bdee4e0258238631', + 'z': 'a4cead13bef1fd1ec5974d1a2f5ceffd243a7aa6c6b08b80e09a7454b7d04293' + } + assert ds.fingerprint == 'dataset-arrays-hashed-88244cf38fe91c6bf435caa6160b089b' + + +def 
test_df(): + x = np.arange(10, dtype='i4') + y = x**2 + df = vaex.from_arrays(x=x, y=y, z=x+y) + assert df.fingerprint() == 'dataframe-8bff307fe39e9ebf0192181c5b3c933d' + + +def test_df_project(): + x = np.arange(10, dtype='i4') + y = x**2 + df = vaex.from_arrays(x=x, y=y, z1=x+y, z2=x-y) + # projecting 2 columns will drop 2 columns, which could be done in different order + df_a = df[['x', 'y']] + df_b = df[['x', 'y']] + assert df_a.fingerprint() == df_b.fingerprint() + assert df_a.fingerprint() == 'dataframe-bfeb0df610e25228f5693e68da946992'