🐛 fingerprints using df.copy and projecting were not deterministic

vaexio · Jun 17, 2021 · 2986d10 · 2986d10
1 parent 9dd7e0e
commit 2986d10
Show file tree

Hide file tree

Showing 4 changed files with 49 additions and 3 deletions.
diff --git a/packages/vaex-core/vaex/dataframe.py b/packages/vaex-core/vaex/dataframe.py
@@ -5631,6 +5631,9 @@ def track(name):
 
             # first create the DataFrame with real data (dataset)
             dataset_columns = {k for k in required if k in self.dataset}
+            # we want a deterministic order for fingerprinting
+            dataset_columns = list(dataset_columns)
+            dataset_columns.sort()
             dataset = self.dataset.project(*dataset_columns)
             df = vaex.from_dataset(dataset)
 

diff --git a/packages/vaex-core/vaex/dataset.py b/packages/vaex-core/vaex/dataset.py
@@ -403,6 +403,9 @@ def row_count(self):
     def project(self, *names):
         all = set(self)
         drop = all - set(names)
+        # we want a deterministic order for fingerprints
+        drop = list(drop)
+        drop.sort()
         return self.dropped(*list(drop))
 
     def concat(self, *others, resolver='flexible'):
@@ -470,7 +473,7 @@ def __hash__(self):
         missing = keys ^ keys_hashed
         if missing:
             raise ValueError(f'Trying to hash a dataset with unhashed columns: {missing} (tip: use dataset.hashed())')
-        return hash(self._ids)
+        return hash(tuple(self._ids.items()))
 
     def _default_lazy_chunk_iterator(self, array_map, columns, chunk_size, reverse=False):
         chunk_size = chunk_size or 1024**2
@@ -1310,8 +1313,10 @@ def id(self):
 
     @property
     def _fingerprint(self):
-        hash = str(self.__hash__())
-        return f'dataset-{self.snake_name}-hashed-{hash}'
+        self.__hash__()  # invoke just to check we don't have missing hashes
+        # but Python's hash functions are not deterministic across processes
+        fp = vaex.cache.fingerprint(tuple(self._ids.items()))
+        return f'dataset-{self.snake_name}-hashed-{fp}'
 
     def leafs(self) -> List[Dataset]:
         return [self]

diff --git a/tests/dataset_test.py b/tests/dataset_test.py
@@ -262,6 +262,11 @@ def test_drop(rebuild_dataset):
     assert ds1.hashed() == ds2.merged(ds3).hashed()
     assert rebuild_dataset(ds1).hashed() == rebuild_dataset(ds2.merged(ds3)).hashed()
 
+    ds1b = dataset.DatasetArrays(x=x, y=y)
+    assert ds1.fingerprint == ds1b.fingerprint
+    ds2b = ds1.dropped('x')
+    assert ds2.fingerprint == ds2b.fingerprint
+
 
 def test_concat(rebuild_dataset):
     x = np.arange(10)

diff --git a/tests/fingerprint_test.py b/tests/fingerprint_test.py
@@ -77,3 +77,36 @@ def test_column_indexed():
     x3 = vaex.column.ColumnIndexed(x**2, i)
     assert x1.fingerprint() == x2.fingerprint()
     assert x1.fingerprint() != x3.fingerprint()
+
+
+# these fingerprints may change over time, e.g. as we change versions,
+# but they should at least not change per Python version, OS or after restarts
+
+def test_dataset_arrays():
+    x = np.arange(10, dtype='i4')
+    y = x**2
+    ds = vaex.dataset.DatasetArrays(x=x, y=y, z=x+y)
+    assert dict(ds._ids) == {
+        'x': '031385dd4f0d2ba1aba2aeab0ad7c99814c90c11e96e5bc7cc8bd72112556dff',
+        'y': '4d48c88e587db8f3855eed9f5d5f51eea769451b7371ecf7bdee4e0258238631',
+        'z': 'a4cead13bef1fd1ec5974d1a2f5ceffd243a7aa6c6b08b80e09a7454b7d04293'
+    }
+    assert ds.fingerprint == 'dataset-arrays-hashed-88244cf38fe91c6bf435caa6160b089b'
+
+
+def test_df():
+    x = np.arange(10, dtype='i4')
+    y = x**2
+    df = vaex.from_arrays(x=x, y=y, z=x+y)
+    assert df.fingerprint() == 'dataframe-8bff307fe39e9ebf0192181c5b3c933d'
+
+
+def test_df_project():
+    x = np.arange(10, dtype='i4')
+    y = x**2
+    df = vaex.from_arrays(x=x, y=y, z1=x+y, z2=x-y)
+    # projecting 2 columns will drop 2 columns, which could be done in a different order
+    df_a = df[['x', 'y']]
+    df_b = df[['x', 'y']]
+    assert df_a.fingerprint() == df_b.fingerprint()
+    assert df_a.fingerprint() == 'dataframe-bfeb0df610e25228f5693e68da946992'