Merge branch 'mainline' into dev/richard/javasmoketest
jamie256 committed Feb 15, 2022
2 parents a25284c + 37cec2f commit 4330535
Showing 5 changed files with 115 additions and 93 deletions.
2 changes: 1 addition & 1 deletion proto/src/constraints.proto
@@ -128,6 +128,6 @@ message DatasetConstraintMsg {
   map<string, ValueConstraintMsgs> value_constraints = 2;
   map<string, SummaryConstraintMsgs> summary_constraints = 3;
   SummaryConstraintMsgs table_shape_constraints = 4;
-  map<string, ValueConstraintMsgs> multi_column_value_constraints = 5;
+  ValueConstraintMsgs multi_column_value_constraints = 5;
 }

8 changes: 2 additions & 6 deletions src/whylogs/core/datasetprofile.py
@@ -16,11 +16,7 @@
 from whylogs.core import ColumnProfile, MultiColumnProfile
 from whylogs.core.flatten_datasetprofile import flatten_summary
 from whylogs.core.model_profile import ModelProfile
-from whylogs.core.statistics.constraints import (
-    DatasetConstraints,
-    MultiColumnValueConstraints,
-    SummaryConstraints,
-)
+from whylogs.core.statistics.constraints import DatasetConstraints, SummaryConstraints
 from whylogs.core.summaryconverters import entropy_from_column_summary
 from whylogs.core.types import TypedDataConverter
 from whylogs.proto import (
@@ -101,7 +97,7 @@ def __init__(
         if columns is None:
             columns = {}
         if multi_columns is None:
-            multi_column_constraints = MultiColumnValueConstraints(constraints.multi_column_value_constraints) if constraints else None
+            multi_column_constraints = constraints.multi_column_value_constraints if constraints else None
             multi_columns = MultiColumnProfile(multi_column_constraints)
         if tags is None:
             tags = {}
24 changes: 13 additions & 11 deletions src/whylogs/core/statistics/constraints.py
@@ -1,3 +1,4 @@
+import copy
 import datetime
 import json
 import logging
@@ -264,7 +265,7 @@ def name(self):
             return self._name
         if self.op == Op.APPLY_FUNC:
             val_or_funct = self.apply_function.__name__
-        elif getattr(self, "value", None) is not None:
+        elif hasattr(self, "value"):
            val_or_funct = self.value
         else:
            val_or_funct = self.regex_pattern
@@ -312,10 +313,10 @@ def merge(self, other) -> "ValueConstraint":
         elif hasattr(self, "value") and hasattr(other, "value"):
             val = self.value
             assert self.value == other.value, f"Cannot merge value constraints with different values: {self.value} and {other.value}"
-        elif all([getattr(v, "value", None) is not None for v in (self, other)]):
+        elif all([hasattr(v, "value") for v in (self, other)]):
             val = self.value
             assert self.value == other.value, f"Cannot merge value constraints with different values: {self.value} and {other.value}"
-        elif all([getattr(v, "regex_pattern", None) for v in (self, other)]):
+        elif all([hasattr(v, "regex_pattern") for v in (self, other)]):
             pattern = self.regex_pattern
             assert (
                 self.regex_pattern == other.regex_pattern
@@ -1100,7 +1101,8 @@ def name(self):
             dependent_cols = Op.Name(self.internal_op) + " " + dependent_cols
         return f"multi column value {dependent_cols} {Op.Name(self.op)} {val_or_ref_columns}"

-    def update(self, columns):
+    def update(self, column_values_dictionary):
+        columns = copy.deepcopy(column_values_dictionary)
         self.total += 1
         if isinstance(self.dependent_columns, str):
             v1 = columns[self.dependent_columns]
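
A note on the hunk above: update() now deep-copies the incoming row dict, so any coercion applied while evaluating the constraint cannot leak back into the caller's data. A minimal standalone sketch of that defensive-copy pattern (the dict contents here are illustrative, not from the repo):

    import copy

    row = {"A": [1, 2], "B": 3}

    columns = copy.deepcopy(row)  # private copy, as in update() above
    columns["A"].append(99)       # constraint-side mutation
    assert row["A"] == [1, 2]     # the caller's row is untouched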
@@ -1342,10 +1344,10 @@ def __init__(
         self.table_shape_constraints = table_shape_constraints

         if multi_column_value_constraints is None:
-            multi_column_value_constraints = list()
-        for i, v in enumerate(multi_column_value_constraints):
-            if isinstance(v, list):
-                multi_column_value_constraints[i] = MultiColumnValueConstraints(v)
+            multi_column_value_constraints = MultiColumnValueConstraints()
+
+        if isinstance(multi_column_value_constraints, list):
+            multi_column_value_constraints = MultiColumnValueConstraints(multi_column_value_constraints)

         self.multi_column_value_constraints = multi_column_value_constraints

@@ -1359,7 +1361,7 @@ def from_protobuf(msg: DatasetConstraintMsg) -> "DatasetConstraints":
         vm = dict([(k, ValueConstraints.from_protobuf(v)) for k, v in msg.value_constraints.items()])
         sm = dict([(k, SummaryConstraints.from_protobuf(v)) for k, v in msg.summary_constraints.items()])
         table_shape_m = SummaryConstraints.from_protobuf(msg.table_shape_constraints)
-        multi_column_value_m = dict([(k, MultiColumnValueConstraints.from_protobuf(v)) for k, v in msg.multi_column_value_constraints.items()])
+        multi_column_value_m = MultiColumnValueConstraints.from_protobuf(msg.multi_column_value_constraints)
         return DatasetConstraints(msg.properties, vm, sm, table_shape_m, multi_column_value_m)

@staticmethod
@@ -1373,7 +1375,7 @@ def to_protobuf(self) -> DatasetConstraintMsg:
         vm = dict([(k, v.to_protobuf()) for k, v in self.value_constraint_map.items()])
         sm = dict([(k, s.to_protobuf()) for k, s in self.summary_constraint_map.items()])
         table_shape_constraints_message = self.table_shape_constraints.to_protobuf()
-        multi_column_value_m = [v.to_protobuf() for v in self.multi_column_value_constraints]
+        multi_column_value_m = self.multi_column_value_constraints.to_protobuf()
         return DatasetConstraintMsg(
             properties=self.dataset_properties,
             value_constraints=vm,
@@ -1389,7 +1391,7 @@ def report(self):
         l1 = [(k, v.report()) for k, v in self.value_constraint_map.items()]
         l2 = [(k, s.report()) for k, s in self.summary_constraint_map.items()]
         l3 = self.table_shape_constraints.report() if self.table_shape_constraints.report() else []
-        l4 = [mc.report() for mc in self.multi_column_value_constraints]
+        l4 = self.multi_column_value_constraints.report() if self.multi_column_value_constraints.report() else []
         return l1 + l2 + l3 + l4


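A quick usage sketch of the constructor logic above: multi_column_value_constraints can now be passed either as a plain list of constraints or as a MultiColumnValueConstraints instance, and the list form is wrapped internally. A sketch, assuming the constraint factories shown in the tests below:

    from whylogs.core.statistics.constraints import (
        DatasetConstraints,
        MultiColumnValueConstraints,
        columnValuesUniqueWithinRow,
    )

    mcv = [columnValuesUniqueWithinRow(column_A="A")]

    # Equivalent after the coercion in __init__:
    dc_from_list = DatasetConstraints(None, multi_column_value_constraints=mcv)
    dc_from_obj = DatasetConstraints(None, multi_column_value_constraints=MultiColumnValueConstraints(mcv))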
15 changes: 10 additions & 5 deletions src/whylogs/core/summaryconverters.py
@@ -320,11 +320,12 @@ def _compute_kl_divergence_continuous_distributions(target_distribution: kll_floats_sketch, reference_distribution: kll_floats_sketch):
         The estimated KL divergence between two continuous features.
     """

+    almost_zero_probability_of_event = 10e-5
     bins_target = np.linspace(target_distribution.get_min_value(), target_distribution.get_max_value(), 100)
     pmf_target = np.array(target_distribution.get_pmf(bins_target))

     pmf_reference = np.array(reference_distribution.get_pmf(bins_target))
+    pmf_reference[pmf_reference == 0] = almost_zero_probability_of_event

     kl_divergence = np.sum(np.where(pmf_target != 0, pmf_target * np.log(pmf_target / pmf_reference), 0))
     return type("Object", (), {"kl_divergence": kl_divergence})
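
The two added lines above implement epsilon smoothing: bins where the reference PMF is exactly zero would otherwise send the log ratio to infinity, so they are floored at a small constant first. A minimal NumPy sketch of the same computation (the standalone function and its arguments are illustrative, not the library API):

    import numpy as np

    def kl_divergence_continuous(pmf_target, pmf_reference, eps=10e-5):
        pmf_target = np.asarray(pmf_target, dtype=float)
        pmf_reference = np.asarray(pmf_reference, dtype=float).copy()
        pmf_reference[pmf_reference == 0] = eps  # floor zero-probability bins
        # Bins where the target PMF is zero contribute nothing to the sum.
        return np.sum(np.where(pmf_target != 0, pmf_target * np.log(pmf_target / pmf_reference), 0))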
@@ -377,6 +378,9 @@ def _compute_kl_divergence_discrete_distributions(
         if i_frequency == 0:
             continue
         ref_frequency = ref_freq_items_map[item.json_value] / ref_total_count if item.json_value in ref_freq_items_map.keys() else 0
+        if ref_frequency == 0:
+            kl_divergence = np.inf
+            break
         kl_divergence += i_frequency * np.log(i_frequency / ref_frequency)

     target_frequent_items_count = len(target_frequent_items.items)
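
The early exit added above encodes the standard convention for discrete KL divergence: if the target assigns positive frequency to an item the reference has never seen, KL(P||Q) is infinite. A self-contained sketch over plain frequency dicts (hypothetical stand-ins for the frequent-items summaries used in the diff):

    import numpy as np

    def kl_divergence_discrete(target_freqs, ref_freqs):
        kl = 0.0
        for item, p in target_freqs.items():
            if p == 0:
                continue
            q = ref_freqs.get(item, 0)
            if q == 0:
                return np.inf  # support mismatch: the divergence is infinite
            kl += p * np.log(p / q)
        return kl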
Expand Down Expand Up @@ -431,13 +435,14 @@ def compute_chi_squared_test_p_value(target_distribution: ReferenceDistributionD

chi_sq = 0
for item in target_freq_items.items:
i_frequency = item.estimate / target_total_count
ref_frequency = ref_dist_items[item.json_value] / ref_total_count if item.json_value in ref_dist_items.keys() else 0
target_frequency = item.estimate
ref_frequency = ref_dist_items[item.json_value] if item.json_value in ref_dist_items else 0
if ref_frequency == 0:
chi_sq = np.inf
else:
chi_sq += (i_frequency - ref_frequency) ** 2 / ref_frequency
break
chi_sq += (target_frequency - ref_frequency) ** 2 / ref_frequency

degrees_of_freedom = target_unique_count - 1
degrees_of_freedom = degrees_of_freedom if degrees_of_freedom > 0 else 1
p_value = stats.chi2.sf(chi_sq, degrees_of_freedom)
return type("Object", (), {"chi_squared_test": p_value})
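
The loop above now feeds counts rather than relative frequencies into the statistic: the chi-squared statistic, the sum of (observed - expected)^2 / expected over categories, only follows a chi-squared distribution when computed on counts, so dividing by the totals would distort the p-value. A standalone sketch, assuming the reference is expressed as expected counts for the same sample size (the helper name is hypothetical):

    import numpy as np
    from scipy import stats

    def chi_squared_p_value(target_counts, expected_counts):
        chi_sq = 0.0
        for category, observed in target_counts.items():
            expected = expected_counts.get(category, 0)
            if expected == 0:
                chi_sq = np.inf  # target category unseen in the reference
                break
            chi_sq += (observed - expected) ** 2 / expected
        dof = max(len(target_counts) - 1, 1)
        return stats.chi2.sf(chi_sq, dof)  # survival function = p-value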
159 changes: 89 additions & 70 deletions tests/unit/core/statistics/test_constraints.py
@@ -1553,8 +1553,7 @@ def test_table_shape_serialization():
     assert ts1_json["verbose"] == ts2_json["verbose"]


-def test_dataset_constraints_serialization():
-
+def _get_sample_dataset_constraints():
     cvisc = columnValuesInSetConstraint(value_set={2, 5, 8})
     ltc = ValueConstraint(Op.LT, 1)
@@ -1564,17 +1563,26 @@ def test_dataset_constraints_serialization():
     set1 = set(["col1", "col2"])
     columns_match_constraint = columnsMatchSetConstraint(set1)

-    dc = DatasetConstraints(
+    val_set = {(1, 2), (3, 5)}
+    col_set = ["A", "B"]
+    mcv_constraints = [
+        columnValuesUniqueWithinRow(column_A="A", verbose=True),
+        columnPairValuesInSetConstraint(column_A="A", column_B="B", value_set=val_set),
+        sumOfRowValuesOfMultipleColumnsEqualsConstraint(columns=col_set, value=100),
+    ]
+
+    return DatasetConstraints(
         None,
         value_constraints={"annual_inc": [cvisc, ltc]},
         summary_constraints={"annual_inc": [max_le_constraint, min_gt_constraint]},
         table_shape_constraints=[columns_match_constraint],
+        multi_column_value_constraints=mcv_constraints,
     )

-    dc_deser = DatasetConstraints.from_protobuf(dc.to_protobuf())

+def _assert_dc_props_equal(dc, dc_deserialized):
     props = dc.dataset_properties
-    deser_props = dc_deser.dataset_properties
+    deser_props = dc_deserialized.dataset_properties

     if all([props, deser_props]):
         pm_json = json.loads(message_to_json(props))
@@ -1587,6 +1595,39 @@
         v_deser = v_deser.sort() if isinstance(v_deser, list) else v_deser
         assert v == v_deser

+
+def _assert_constraints_equal(constraints, deserialized_constraints):
+    for (name, c), (deser_name, deser_c) in zip(constraints.items(), deserialized_constraints.items()):
+        assert name == deser_name
+
+        a = json.loads(message_to_json(c.to_protobuf()))
+        b = json.loads(message_to_json(deser_c.to_protobuf()))
+
+        for (k, v), (k_deser, v_deser) in zip(a.items(), b.items()):
+            assert k == k_deser
+            if all([v, v_deser]):
+                v = v.sort() if isinstance(v, list) else v
+                v_deser = v_deser.sort() if isinstance(v_deser, list) else v_deser
+            assert v == v_deser
+
+
+def _get_all_value_constraints(constraints):
+
+    all_v_constraints = dict()
+    all_v_constraints.update(constraints.raw_value_constraints)
+    all_v_constraints.update(constraints.coerced_type_constraints)
+
+    return all_v_constraints
+
+
+def test_dataset_constraints_serialization():
+
+    dc = _get_sample_dataset_constraints()
+
+    dc_deser = DatasetConstraints.from_protobuf(dc.to_protobuf())
+
+    _assert_dc_props_equal(dc, dc_deser)
+
     value_constraints = dc.value_constraint_map
     summary_constraints = dc.summary_constraint_map
     table_shape_constraints = dc.table_shape_constraints
@@ -1600,77 +1641,22 @@ def test_dataset_constraints_serialization():
     for (column, constraints), (deser_column, deser_constraints) in zip(value_constraints.items(), deser_v_c.items()):
         assert column == deser_column

-        all_constraints = dict()
-        all_constraints.update(constraints.raw_value_constraints)
-        all_constraints.update(constraints.coerced_type_constraints)
-
-        all_constraints_deser = dict()
-        all_constraints_deser.update(deser_constraints.raw_value_constraints)
-        all_constraints_deser.update(deser_constraints.coerced_type_constraints)
-
-        for (name, c), (deser_name, deser_c) in zip(all_constraints.items(), all_constraints_deser.items()):
-            assert name == deser_name
-
-            a = json.loads(message_to_json(c.to_protobuf()))
-            b = json.loads(message_to_json(deser_c.to_protobuf()))
+        all_v_constraints = _get_all_value_constraints(constraints)
+        all_v_constraints_deser = _get_all_value_constraints(deser_constraints)

-            for (k, v), (k_deser, v_deser) in zip(a.items(), b.items()):
-                assert k == k_deser
-                if all([v, v_deser]):
-                    v = v.sort() if isinstance(v, list) else v
-                    v_deser = v_deser.sort() if isinstance(v_deser, list) else v_deser
-                    assert v == v_deser
+        _assert_constraints_equal(all_v_constraints, all_v_constraints_deser)

     for (column, constraints), (deser_column, deser_constraints) in zip(summary_constraints.items(), deser_s_c.items()):
         assert column == deser_column

-        for (name, c), (deser_name, deser_c) in zip(constraints.constraints.items(), deser_constraints.constraints.items()):
-            assert name == deser_name
+        _assert_constraints_equal(constraints.constraints, deser_constraints.constraints)

-            a = json.loads(message_to_json(c.to_protobuf()))
-            b = json.loads(message_to_json(deser_c.to_protobuf()))
+    _assert_constraints_equal(table_shape_constraints.constraints, deser_ts_c.constraints)

-            for (k, v), (k_deser, v_deser) in zip(a.items(), b.items()):
-                assert k == k_deser
-                if all([v, v_deser]):
-                    v = v.sort() if isinstance(v, list) else v
-                    v_deser = v_deser.sort() if isinstance(v_deser, list) else v_deser
-                    assert v == v_deser
+    all_mc_constraints = _get_all_value_constraints(multi_column_value_constraints)
+    all_mc_constraints_deser = _get_all_value_constraints(deser_mcv_c)

-        for (name, c), (deser_name, deser_c) in zip(table_shape_constraints.constraints.items(), deser_ts_c.constraints.items()):
-            assert name == deser_name
-
-            a = json.loads(message_to_json(c.to_protobuf()))
-            b = json.loads(message_to_json(deser_c.to_protobuf()))
-
-            for (k, v), (k_deser, v_deser) in zip(a.items(), b.items()):
-                assert k == k_deser
-                if all([v, v_deser]):
-                    v = v.sort() if isinstance(v, list) else v
-                    v_deser = v_deser.sort() if isinstance(v_deser, list) else v_deser
-                    assert v == v_deser
-
-    for (mcvc, deser_mcvc) in zip(multi_column_value_constraints, deser_mcv_c):
-        all_constraints = dict()
-        all_constraints.update(mcvc.raw_value_constraints)
-        all_constraints.update(mcvc.coerced_type_constraints)
-
-        all_constraints_deser = dict()
-        all_constraints_deser.update(deser_mcvc.raw_value_constraints)
-        all_constraints_deser.update(deser_mcvc.coerced_type_constraints)
-
-        for (name, c), (deser_name, deser_c) in zip(all_constraints.items(), all_constraints_deser.items()):
-            assert name == deser_name
-
-            a = json.loads(message_to_json(c.to_protobuf()))
-            b = json.loads(message_to_json(deser_c.to_protobuf()))
-
-            for (k, v), (k_deser, v_deser) in zip(a.items(), b.items()):
-                assert k == k_deser
-                if all([v, v_deser]):
-                    v = v.sort() if isinstance(v, list) else v
-                    v_deser = v_deser.sort() if isinstance(v_deser, list) else v_deser
-                    assert v == v_deser
+    _assert_constraints_equal(all_mc_constraints, all_mc_constraints_deser)

     report = dc.report()
     report_deser = dc_deser.report()
@@ -2280,7 +2266,7 @@ def test_column_kl_divergence_less_than_constraint_wrong_datatype():


 def test_chi_squared_test_p_value_greater_than_constraint_true(df_lending_club, local_config_path):
-    test_values = ["A", "A", "B", "C", "C", "C", "C", "D", "D", "E", "F"]
+    test_values = ["A"] * 6 + ["B"] * 13 + ["C"] * 25 + ["D"] * 3 + ["E"] + ["F"] * 2
     kspval = columnChiSquaredTestPValueGreaterThanConstraint(test_values, p_value=0.1)
     dc = DatasetConstraints(None, summary_constraints={"grade": [kspval]})
     config = load_config(local_config_path)
@@ -2889,6 +2875,38 @@ def test_column_values_unique_within_row_constraint_invalid_params():
         columnValuesUniqueWithinRow(column_A=["A"])


+def test_multicolumn_value_constraints_report(local_config_path):
+    data = pd.DataFrame(
+        {
+            "A": [50, 23, 42, 11],
+            "B": [52, 77, 58, 100],
+        }
+    )
+
+    val_set = {(1, 2), (3, 5)}
+    col_set = ["A", "B"]
+    constraints = [
+        columnValuesUniqueWithinRow(column_A="A", verbose=True),
+        columnPairValuesInSetConstraint(column_A="A", column_B="B", value_set=val_set),
+        sumOfRowValuesOfMultipleColumnsEqualsConstraint(columns=col_set, value=100),
+    ]
+    mcvc = MultiColumnValueConstraints(constraints)
+
+    report = _apply_value_constraints_on_dataset(data, local_config_path, multi_column_value_constraints=mcvc)
+    assert len(report) == 3
+    assert report[0][0] == "The values of the column A are unique within each row"
+    assert report[0][1] == 4
+    assert report[0][2] == 0
+
+    assert report[1][0] == f"The pair of values of the columns A and B are in {val_set}"
+    assert report[1][1] == 4
+    assert report[1][2] == 4
+
+    assert report[2][0] == "The sum of the values of A and B is equal to 100"
+    assert report[2][1] == 4
+    assert report[2][2] == 2
+
+
 def test_multicolumn_value_constraints_serialization_deserialization():
     val_set = {(1, 2), (3, 5)}
     col_set = ["A", "B"]
@@ -2899,12 +2917,13 @@ def test_multicolumn_value_constraints_serialization_deserialization():
     ]
     mcvc = MultiColumnValueConstraints(constraints)

-    mcvc.from_protobuf(mcvc.to_protobuf())
+    mcvc = MultiColumnValueConstraints.from_protobuf(mcvc.to_protobuf())
     json_value = json.loads(message_to_json(mcvc.to_protobuf()))
     multi_column_constraints = json_value["multiColumnConstraints"]
     unique = multi_column_constraints[0]
     pair_values = multi_column_constraints[1]
     sum_of_values = multi_column_constraints[2]

+    assert len(multi_column_constraints) == 3

     assert unique["name"] == f"The values of the column A are unique within each row"
