Skip to content

Commit

Permalink
Merge main into sweep/fix-sweep-gha
Browse files: browse the repository at this point in the history
  • Loading branch information
sweep-ai[bot] committed Jan 22, 2024
2 parents 6786717 + c7d5763 commit 3413c24
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 3 deletions.
10 changes: 7 additions & 3 deletions orderly/clean/cleaner.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -868,6 +868,11 @@ def get_matching_indices(
LOG.info(
f"preparing to move rows from test to train set based on {reactant_columns=} and {product_columns=}"
)

# Need to fillna with "NULL" so that the matching works
for col in reactant_columns + product_columns:
df[col] = df[col].fillna("NULL")

# Get reaction 'hashes'
reaction_hashes = [
".".join(
Expand Down Expand Up @@ -1347,11 +1352,10 @@ def main(
reactant_columns = list(df.columns[df.columns.str.startswith("reactant")])
product_columns = list(df.columns[df.columns.str.startswith("product")])

for col in reactant_columns + product_columns:
df[col] = df[col].fillna("NULL")
df_for_matching = df.copy()

matching_indices = get_matching_indices(
df, train_indices, test_indices, reactant_columns, product_columns
df_for_matching, train_indices, test_indices, reactant_columns, product_columns
)

# drop the matching rows from the test set
Expand Down
4 changes: 4 additions & 0 deletions tests/test_clean.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -891,6 +891,10 @@ def test_get_cleaned_df(

cleaned_df, _ = copy.copy(cleaned_df_params_default)
assert not cleaned_df.empty

# check that "NULL" is not in the dataframe
assert not "NULL" in cleaned_df.values

# TODO: check that there's only NaN or NaT, but no None


Expand Down

0 comments on commit 3413c24

Please sign in to comment.