Skip to content

Commit

Permalink
🐛 fingerprints using df.copy and projecting were not deterministic
Browse files Browse the repository at this point in the history
  • Loading branch information
maartenbreddels committed Jun 17, 2021
1 parent 9dd7e0e commit 2986d10
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 3 deletions.
3 changes: 3 additions & 0 deletions packages/vaex-core/vaex/dataframe.py
Expand Up @@ -5631,6 +5631,9 @@ def track(name):

# first create the DataFrame with real data (dataset)
dataset_columns = {k for k in required if k in self.dataset}
# we want a deterministic order for fingerprinting
dataset_columns = list(dataset_columns)
dataset_columns.sort()
dataset = self.dataset.project(*dataset_columns)
df = vaex.from_dataset(dataset)

Expand Down
11 changes: 8 additions & 3 deletions packages/vaex-core/vaex/dataset.py
Expand Up @@ -403,6 +403,9 @@ def row_count(self):
def project(self, *names):
    """Return this dataset restricted to the columns listed in ``names``.

    Every column of ``self`` that is not in ``names`` is passed to
    ``self.dropped``. Entries of ``names`` that are not columns of ``self``
    have no effect, since only the set difference is used.
    """
    # Set iteration order is arbitrary, so sort the names to drop: the drop
    # order must be deterministic for fingerprints.
    drop = sorted(set(self) - set(names))
    return self.dropped(*drop)

def concat(self, *others, resolver='flexible'):
Expand Down Expand Up @@ -470,7 +473,7 @@ def __hash__(self):
missing = keys ^ keys_hashed
if missing:
raise ValueError(f'Trying to hash a dataset with unhashed columns: {missing} (tip: use dataset.hashed())')
return hash(self._ids)
return hash(tuple(self._ids.items()))

def _default_lazy_chunk_iterator(self, array_map, columns, chunk_size, reverse=False):
chunk_size = chunk_size or 1024**2
Expand Down Expand Up @@ -1310,8 +1313,10 @@ def id(self):

@property
def _fingerprint(self):
    """Fingerprint string of the form ``dataset-<snake_name>-hashed-<fp>``.

    ``<fp>`` is computed by ``vaex.cache.fingerprint`` over the
    ``(name, id)`` pairs of ``self._ids``.
    """
    # Called only for its check: we must not have missing hashes.
    self.__hash__()
    # Python's built-in hash() is not deterministic across processes, so the
    # fingerprint is derived from the column ids rather than from hash().
    fp = vaex.cache.fingerprint(tuple(self._ids.items()))
    return f'dataset-{self.snake_name}-hashed-{fp}'

def leafs(self) -> List[Dataset]:
return [self]
Expand Down
5 changes: 5 additions & 0 deletions tests/dataset_test.py
Expand Up @@ -262,6 +262,11 @@ def test_drop(rebuild_dataset):
assert ds1.hashed() == ds2.merged(ds3).hashed()
assert rebuild_dataset(ds1).hashed() == rebuild_dataset(ds2.merged(ds3)).hashed()

ds1b = dataset.DatasetArrays(x=x, y=y)
assert ds1.fingerprint == ds1b.fingerprint
ds2b = ds1.dropped('x')
assert ds2.fingerprint == ds2b.fingerprint


def test_concat(rebuild_dataset):
x = np.arange(10)
Expand Down
33 changes: 33 additions & 0 deletions tests/fingerprint_test.py
Expand Up @@ -77,3 +77,36 @@ def test_column_indexed():
x3 = vaex.column.ColumnIndexed(x**2, i)
assert x1.fingerprint() == x2.fingerprint()
assert x1.fingerprint() != x3.fingerprint()


# these fingerprints may change over time, they may change as we change versions
# but they should at least not change per Python version, OS or after restarts

def test_dataset_arrays():
    """Pin the column ids and the fingerprint of a small DatasetArrays."""
    base = np.arange(10, dtype='i4')
    squared = base**2
    arrays = vaex.dataset.DatasetArrays(x=base, y=squared, z=base+squared)
    expected_ids = {
        'x': '031385dd4f0d2ba1aba2aeab0ad7c99814c90c11e96e5bc7cc8bd72112556dff',
        'y': '4d48c88e587db8f3855eed9f5d5f51eea769451b7371ecf7bdee4e0258238631',
        'z': 'a4cead13bef1fd1ec5974d1a2f5ceffd243a7aa6c6b08b80e09a7454b7d04293',
    }
    assert dict(arrays._ids) == expected_ids
    assert arrays.fingerprint == 'dataset-arrays-hashed-88244cf38fe91c6bf435caa6160b089b'


def test_df():
    """Pin the fingerprint of a DataFrame built from three small arrays."""
    base = np.arange(10, dtype='i4')
    squared = base**2
    frame = vaex.from_arrays(x=base, y=squared, z=base+squared)
    expected = 'dataframe-8bff307fe39e9ebf0192181c5b3c933d'
    assert frame.fingerprint() == expected


def test_df_project():
    """Check that projecting the same columns twice gives one pinned fingerprint."""
    base = np.arange(10, dtype='i4')
    squared = base**2
    frame = vaex.from_arrays(x=base, y=squared, z1=base+squared, z2=base-squared)
    # keeping 2 columns drops the other 2, and those drops could be done in
    # a different order each time
    first, second = frame[['x', 'y']], frame[['x', 'y']]
    assert first.fingerprint() == second.fingerprint()
    assert first.fingerprint() == 'dataframe-bfeb0df610e25228f5693e68da946992'

0 comments on commit 2986d10

Please sign in to comment.