Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛 fingerprints using df.copy and projecting were not deterministic #1409

Merged
merged 1 commit into from Jun 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions packages/vaex-core/vaex/dataframe.py
Expand Up @@ -5631,6 +5631,9 @@ def track(name):

# first create the DataFrame with real data (dataset)
dataset_columns = {k for k in required if k in self.dataset}
# we want a deterministic order for fingerprinting
dataset_columns = list(dataset_columns)
dataset_columns.sort()
dataset = self.dataset.project(*dataset_columns)
df = vaex.from_dataset(dataset)

Expand Down
11 changes: 8 additions & 3 deletions packages/vaex-core/vaex/dataset.py
Expand Up @@ -403,6 +403,9 @@ def row_count(self):
def project(self, *names):
    """Drop every column of this dataset that is not listed in ``names``.

    The columns of ``self`` are obtained by iterating over it; those not
    present in ``names`` are handed to ``self.dropped`` in sorted order.

    :param names: names of the columns to keep.
    :return: whatever ``self.dropped`` returns for the unwanted columns.
    """
    unwanted = set(self) - set(names)
    # we want a deterministic order for fingerprints: set iteration order
    # is arbitrary, so always drop the columns in sorted order
    return self.dropped(*sorted(unwanted))

def concat(self, *others, resolver='flexible'):
Expand Down Expand Up @@ -470,7 +473,7 @@ def __hash__(self):
missing = keys ^ keys_hashed
if missing:
raise ValueError(f'Trying to hash a dataset with unhashed columns: {missing} (tip: use dataset.hashed())')
return hash(self._ids)
return hash(tuple(self._ids.items()))

def _default_lazy_chunk_iterator(self, array_map, columns, chunk_size, reverse=False):
chunk_size = chunk_size or 1024**2
Expand Down Expand Up @@ -1310,8 +1313,10 @@ def id(self):

@property
def _fingerprint(self):
    """Fingerprint string for this dataset, built from its column ids.

    The result has the form ``dataset-<snake_name>-hashed-<fp>``, where
    ``<fp>`` is ``vaex.cache.fingerprint`` applied to the ``(name, id)``
    pairs in ``self._ids``.
    """
    self.__hash__()  # invoke just to check we don't have missing hashes
    # Python's hash functions are not deterministic (cross process), so the
    # fingerprint is derived from the column ids instead of hash(self)
    fp = vaex.cache.fingerprint(tuple(self._ids.items()))
    return f'dataset-{self.snake_name}-hashed-{fp}'

def leafs(self) -> List[Dataset]:
    """Return a one-element list containing this dataset itself."""
    return [self]
Expand Down
5 changes: 5 additions & 0 deletions tests/dataset_test.py
Expand Up @@ -262,6 +262,11 @@ def test_drop(rebuild_dataset):
assert ds1.hashed() == ds2.merged(ds3).hashed()
assert rebuild_dataset(ds1).hashed() == rebuild_dataset(ds2.merged(ds3)).hashed()

ds1b = dataset.DatasetArrays(x=x, y=y)
assert ds1.fingerprint == ds1b.fingerprint
ds2b = ds1.dropped('x')
assert ds2.fingerprint == ds2b.fingerprint


def test_concat(rebuild_dataset):
x = np.arange(10)
Expand Down
33 changes: 33 additions & 0 deletions tests/fingerprint_test.py
Expand Up @@ -77,3 +77,36 @@ def test_column_indexed():
x3 = vaex.column.ColumnIndexed(x**2, i)
assert x1.fingerprint() == x2.fingerprint()
assert x1.fingerprint() != x3.fingerprint()


# these fingerprints may change over time, they may change as we change versions
# but they should at least not change per Python version, OS or after restarts

def test_dataset_arrays():
    """Pin the column ids and the fingerprint of a small DatasetArrays."""
    x = np.arange(10, dtype='i4')
    y = x**2
    expected_ids = {
        'x': '031385dd4f0d2ba1aba2aeab0ad7c99814c90c11e96e5bc7cc8bd72112556dff',
        'y': '4d48c88e587db8f3855eed9f5d5f51eea769451b7371ecf7bdee4e0258238631',
        'z': 'a4cead13bef1fd1ec5974d1a2f5ceffd243a7aa6c6b08b80e09a7454b7d04293',
    }
    expected_fingerprint = 'dataset-arrays-hashed-88244cf38fe91c6bf435caa6160b089b'

    ds = vaex.dataset.DatasetArrays(x=x, y=y, z=x+y)

    assert dict(ds._ids) == expected_ids
    assert ds.fingerprint == expected_fingerprint


def test_df():
    """Pin the fingerprint of a DataFrame built directly from three arrays."""
    base = np.arange(10, dtype='i4')
    squares = base**2
    expected = 'dataframe-8bff307fe39e9ebf0192181c5b3c933d'

    frame = vaex.from_arrays(x=base, y=squares, z=base+squares)

    assert frame.fingerprint() == expected


def test_df_project():
    """Check that projecting the same columns twice gives one pinned fingerprint."""
    base = np.arange(10, dtype='i4')
    squares = base**2
    expected = 'dataframe-bfeb0df610e25228f5693e68da946992'
    frame = vaex.from_arrays(x=base, y=squares, z1=base+squares, z2=base-squares)

    # projecting 2 columns will drop 2 columns, which could be done in different order
    projected_a, projected_b = [frame[['x', 'y']] for _ in range(2)]

    assert projected_a.fingerprint() == projected_b.fingerprint()
    assert projected_a.fingerprint() == expected