Skip to content

Commit

Permalink
Edit default total_row_number in DatasetProfile. Change the way of getting the row values when logging a dataframe.
Browse files Browse the repository at this point in the history
  • Loading branch information
pecop2 committed Jan 26, 2022
1 parent 1c73b52 commit da4bf16
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 8 deletions.
1 change: 0 additions & 1 deletion src/whylogs/core/columnprofile.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
)
from whylogs.core.statistics.constraints import (
MultiColumnValueConstraints,
SummaryConstraint,
SummaryConstraints,
ValueConstraints,
columnMostCommonValueInSetConstraint,
Expand Down
16 changes: 9 additions & 7 deletions src/whylogs/core/datasetprofile.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,8 @@ def session_timestamp_ms(self):

@property
def total_row_number(self):
return max(self.column_row_dict.values())
dict_counts = self.column_row_dict.values() if len(self.column_row_dict) else [0]
return max(dict_counts)

def add_output_field(self, field: Union[str, List[str]]):
if self.model_profile is None:
Expand Down Expand Up @@ -311,17 +312,18 @@ def track_dataframe(self, df: pd.DataFrame, character_list=None, token_method=No
large_df = element_count > 200000
if large_df:
logger.warning(f"About to log a dataframe with {element_count} elements, logging might take some time to complete.")

count = 0

columns_len = len(df.columns)
num_records = len(df)
for idx in range(num_records):
row_values = df.iloc[idx].values
row_values = []
count += 1
for col_idx in range(columns_len):
col = df.columns[col_idx]
col_str = str(col)
self.track(col_str, row_values[col_idx], character_list=None, token_method=None)
for col in df.columns:
col_values = df[col].values
value = col_values[idx]
row_values.append(value)
self.track(col, value, character_list=None, token_method=None)
if large_df and (count % 200000 == 0):
logger.warning(f"Logged {count} elements out of {element_count}")

Expand Down
2 changes: 2 additions & 0 deletions tests/unit/core/statistics/test_constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -2395,6 +2395,8 @@ def test_generate_default_constraints_mixed(local_config_path):
assert followers_constraints[2]["name"] == "summary column_values_type EQ INTEGRAL"
assert followers_constraints[3]["name"] == "summary unique_count BTWN 3 and 5" # we have 4 unique values in the df
assert "summary most_common_value IN" in followers_constraints[4]["name"]


def _apply_value_constraints_on_dataset(df_lending_club, local_config_path, value_constraints=None, multi_column_value_constraints=None):
dc = DatasetConstraints(None, value_constraints=value_constraints, multi_column_value_constraints=multi_column_value_constraints)
config = load_config(local_config_path)
Expand Down

0 comments on commit da4bf16

Please sign in to comment.