diff --git a/examples/Constraints_Suite.ipynb b/examples/Constraints_Suite.ipynb new file mode 100644 index 0000000000..0c28b02ee1 --- /dev/null +++ b/examples/Constraints_Suite.ipynb @@ -0,0 +1,2645 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f13777a7", + "metadata": {}, + "source": [ + "### whylogs provides a specific function for each of the common constraints. Fall back to the generic ValueConstraint and SummaryConstraint only when you need a custom constraint that none of these functions covers." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b32c0dc0", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARN: Missing config\n" + ] + } + ], + "source": [ + "from whylogs import get_or_create_session\n", + "from whylogs.util.protobuf import message_to_json\n", + "\n", + "# create session\n", + "session = get_or_create_session()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e0591c0c", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f5ba8afd", + "metadata": {}, + "outputs": [], + "source": [ + "from tabulate import tabulate\n", + "\n", + "def indent(txt, spaces=4):\n", + " return \"\\n\".join(\" \" * spaces + ln for ln in txt.splitlines())\n", + "\n", + "def format_report(r):\n", + " # report failures in tabular form\n", + " \n", + " r_2 = [entry for entry in r if len(entry)==2] # all the single column constraints\n", + " r_table_shape = [[entry for entry in r if len(entry)!=2 and entry[0].startswith(\"table\")]] # table shape constraints\n", + " r_multi_column = [[entry for entry in r if len(entry)!=2 and entry[0].startswith(\"multi column\")]] # multi column constraints\n", + " \n", + " if len(r_2):\n", + " print(\"Constraint failures by feature - \")\n", + " for c,r in r_2:\n", + " print(f\"{c}:\")\n", + " if len(r[0][0]) > 80: \n", + " print(f\"\\ntest_name:\\t{r[0][0]}\\n\")\n", + " print(f\"total_run:\\t{r[0][1]}\\n\")\n", + " print(f\"failed:\\t\\t{r[0][2]}\\n\")\n", + " else: \n", + " print(indent(tabulate(r, tablefmt=\"plain\", headers=['test_name', 'total_run', 'failed'])))\n", + " \n", + " if len(r_table_shape[0]):\n", + " print () \n", + " print(\"Table shape constraint failures -\")\n", + " for entry in r_table_shape:\n", + " print(indent(tabulate(entry, tablefmt=\"plain\", headers=['test_name', 'total_run', 'failed'])))\n", + " \n", + " if len(r_multi_column[0]):\n", + " print() \n", + " print(\"Multi column constraint failures -\")\n", + " for entry in r_multi_column:\n", + " if len(entry[0][0]) > 80: \n", + " print(f\"\\ntest_name:\\t{entry[0][0]}\\n\")\n", + " print(f\"total_run:\\t{entry[0][1]}\\n\")\n", + " print(f\"failed:\\t\\t{entry[0][2]}\\n\")\n", + " else:\n", + " print(indent(tabulate(entry, tablefmt=\"plain\", headers=['test_name', 'total_run', 'failed'])))\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "7b6068d2", + "metadata": {}, + "source": [ + "## Between summary constraints on summary fields such as stddev, min, max, and mean"
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "bfd0b5a7", + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import (\n", + " maxBetweenConstraint,\n", + " maxLessThanEqualConstraint,\n", + " meanBetweenConstraint,\n", + " minBetweenConstraint,\n", + " minGreaterThanEqualConstraint,\n", + " stddevBetweenConstraint,\n", + " stringLengthBetweenConstraint,\n", + " stringLengthEqualConstraint,\n", + " quantileBetweenConstraint,\n", + " DatasetConstraints,\n", + " SummaryConstraints,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "id": "71c0b923", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraints for column 'col1': \n", + "[\n", + " {\n", + " \"name\": \"summary max BTWN 5 and 10.8\",\n", + " \"firstField\": \"max\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": 5.0,\n", + " \"upperValue\": 10.8\n", + " },\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary stddev BTWN 2.3 and 5.4\",\n", + " \"firstField\": \"stddev\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": 2.3,\n", + " \"upperValue\": 5.4\n", + " },\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary min GE 1/None\",\n", + " \"firstField\": \"min\",\n", + " \"value\": 1.0,\n", + " \"op\": \"GE\",\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " }\n", + "]\n", + "\n", + "Constraints for column 'col2': \n", + "[\n", + " {\n", + " \"name\": \"summary mean BTWN 1.2 and 1.6\",\n", + " \"firstField\": \"mean\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": 1.2,\n", + " \"upperValue\": 1.6\n", + " },\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary min BTWN 0.1 and 0.5\",\n", + " \"firstField\": \"min\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": 0.1,\n", + " \"upperValue\": 0.5\n", + " },\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary quantile 0.15 BTWN 2 and 4.3\",\n", + " \"firstField\": \"quantile\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": 2.0,\n", + " \"upperValue\": 4.3\n", + " },\n", + " \"quantileValue\": 0.15,\n", + " \"verbose\": false\n", + " }\n", + "]\n", + "\n", + "Constraints for column 'col3': \n", + "[\n", + " {\n", + " \"name\": \"summary max LE 100/None\",\n", + " \"firstField\": \"max\",\n", + " \"value\": 100.0,\n", + " \"op\": \"LE\",\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " }\n", + "]\n", + "\n" + ] + } + ], + "source": [ + "# define the specific types of constraints\n", + "# the ranges of the between constraints include the bouding values\n", + "\n", + "# check if the maximum value of the column is in the range [5, 10.8]\n", + "max_between_values = maxBetweenConstraint(lower_value=5, upper_value=10.8) \n", + "# check if the maximum value of the column is less than or equal to 100\n", + "max_less_than_equal_value = maxLessThanEqualConstraint(value=100)\n", + "# check if the mean of the column is in the range [1.2, 1.6] \n", + "mean_between_values = meanBetweenConstraint(lower_value=1.2, upper_value=1.6)\n", + "# check if the minimum value of the column is in the range [0.1, 0.5]\n", + "min_between_values = 
minBetweenConstraint(lower_value=0.1, upper_value=0.5)\n", + "# check if the minimum value of the column is greater than or equal to 1\n", + "min_greater_than_equal_value = minGreaterThanEqualConstraint(value=1)\n", + "# check if the standard deviation of the column is in the range [2.3, 5.4]\n", + "stddev_between_values = stddevBetweenConstraint(lower_value=2.3, upper_value=5.4)\n", + "# check if the 0.15 quantile value is in the range [2, 4.3]\n", + "quantile_between_values = quantileBetweenConstraint(quantile_value=0.15, lower_value=2, upper_value=4.3) \n", + "\n", + "# example data frame with columns \"col1\", \"col2\", \"col3\"\n", + "# you can also read an existing data set using pandas, or as a numpy array\n", + "df = pd.DataFrame({\n", + " \"col1\": [4, 5, 6, 7],\n", + " \"col2\": [0, 1, 2, 3],\n", + " \"col3\": [50, 60, 80, 110]\n", + "})\n", + "\n", + "# bind the standard deviation between constraint to the dataframe column named \"col1\"\n", + "# bind the mean between constraint to the dataframe column named \"col2\"\n", + "# you can add multiple summary constraints for each column\n", + "dc = DatasetConstraints(None, summary_constraints={\n", + " \"col1\": [max_between_values, stddev_between_values, min_greater_than_equal_value], \n", + " \"col2\": [mean_between_values, min_between_values, quantile_between_values],\n", + " \"col3\": [max_less_than_equal_value]\n", + "}) \n", + "\n", + "# logging the dataframe creates a profile with summary statistics for the data set\n", + "# the data set profile contains column profiles with summary statistics for each column present in the data set\n", + "profile = session.log_dataframe(df, \"test.data\", constraints=dc)\n", + "\n", + "# serialize the DatasetConstraints to JSON\n", + "dc_json = json.loads(dc.to_json())\n", + "col1_constraints = json.dumps(dc_json['summaryConstraints']['col1']['constraints'], indent=4)\n", + "col2_constraints = json.dumps(dc_json['summaryConstraints']['col2']['constraints'], indent=4)\n", + "col3_constraints = json.dumps(dc_json['summaryConstraints']['col3']['constraints'], indent=4)\n", + "\n", + "print(f\"Constraints for column 'col1': \\n{col1_constraints}\\n\")\n", + "print(f\"Constraints for column 'col2': \\n{col2_constraints}\\n\")\n", + "print(f\"Constraints for column 'col3': \\n{col3_constraints}\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "21f3cc64", + "metadata": {}, + "source": [ + "#### Summary constraints are applied with apply_summary_constraints on the DatasetProfile."
+ ] + }, + { + "cell_type": "code", + "execution_count": 136, + "id": "5625be78", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "col1:\n", + " test_name total_run failed\n", + " summary max BTWN 5 and 10.8 1 0\n", + " summary stddev BTWN 2.3 and 5.4 1 1\n", + " summary min GE 1/None 1 0\n", + "col2:\n", + " test_name total_run failed\n", + " summary mean BTWN 1.2 and 1.6 1 0\n", + " summary min BTWN 0.1 and 0.5 1 1\n", + " summary quantile 0.15 BTWN 2 and 4.3 1 1\n", + "col3:\n", + " test_name total_run failed\n", + " summary max LE 100/None 1 1\n" + ] + } + ], + "source": [ + "# summary constraints must be applied on the dataset profile, after logging the dataframe\n", + "report = profile.apply_summary_constraints()\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "id": "a42655a1", + "metadata": {}, + "source": [ + "As we can see **mean BTWN** passes and the **stddev BTWN** fails as they should." + ] + }, + { + "cell_type": "markdown", + "id": "037e5ee9", + "metadata": {}, + "source": [ + "## Summary constraints for distinct, unique and most common values in a column" + ] + }, + { + "cell_type": "markdown", + "id": "b7daf512", + "metadata": {}, + "source": [ + "### Distinct values in a column" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "10e1e6ac", + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import (\n", + " distinctValuesInSetConstraint, distinctValuesEqualSetConstraint, distinctValuesContainSetConstraint )" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "id": "e2ae7cff", + "metadata": {}, + "outputs": [], + "source": [ + "in_set = distinctValuesInSetConstraint(reference_set=set(range(1, 10)))\n", + "eq_set = distinctValuesEqualSetConstraint(reference_set={'a', 'a', 'a'})\n", + "contain_set = distinctValuesContainSetConstraint(reference_set={0, 1})" + ] + }, + { + "cell_type": "markdown", + "id": "151629fd", + "metadata": {}, + "source": [ + "#### Applying summary constraints sent as an argument to apply_summary_constraints function on the same profile as before!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 138, + "id": "d5370632", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "col1:\n", + " test_name total_run failed\n", + " summary distinct_column_values IN_SET {1, 2, 3, 4, 5, 6, 7, 8, 9} 1 0\n", + " summary distinct_column_values EQ_SET {'a'} 1 1\n", + "col2:\n", + " test_name total_run failed\n", + " summary distinct_column_values CONTAIN_SET {0, 1} 1 0\n" + ] + } + ], + "source": [ + "report = profile.apply_summary_constraints({'col1': SummaryConstraints([in_set, eq_set]), \n", + " 'col2': SummaryConstraints([contain_set])})\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "id": "dcbdd65c", + "metadata": {}, + "source": [ + "### Unique column value count and proportion constraints" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "221efa41", + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import (\n", + " columnUniqueValueCountBetweenConstraint,\n", + " columnUniqueValueProportionBetweenConstraint,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e0b04634", + "metadata": {}, + "outputs": [], + "source": [ + "# create a data set with customers, the country they live in, and their spending\n", + "customer_data = pd.DataFrame({\n", + " \"customer\": [\"c1\", \"c2\", \"c3\", \"c4\", \"c5\", \"c6\"],\n", + " \"country\": [\"Germany\", \"Italy\", \"Germany\", \"USA\", \"Germany\", \"UK\"],\n", + " \"spending\": [1200, 500, 700, 1500, 300, None]\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "fc144d79", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "country:\n", + " test_name total_run failed\n", + " summary unique_count BTWN 1 and 5 1 0\n", + " summary unique_proportion BTWN 0.3 and 0.45 1 1\n" + ] + } + ], + "source": [ + "# check if there are between 1 and 5 unique values in the specific column\n", + "unique_value_count_between = columnUniqueValueCountBetweenConstraint(lower_value=1, upper_value=5)\n", + "# check if the proportion of unique values int he set is between 0.3 and 0.4 inclusive\n", + "unique_value_proportion_between = columnUniqueValueProportionBetweenConstraint(lower_fraction=0.3, upper_fraction=0.45)\n", + "dc = DatasetConstraints(None, summary_constraints={\"country\": [unique_value_count_between, unique_value_proportion_between]})\n", + "\n", + "# log the customer_data dataframe to obrain the profile\n", + "profile = session.log_dataframe(customer_data, 'test2.data', constraints=dc)\n", + "# summary constraints must be applied on the profile after the data set has been logged\n", + "report = profile.apply_summary_constraints()\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "id": "89bad55b", + "metadata": {}, + "source": [ + "### Column most common value in set constraint" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "cf56143a", + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import columnMostCommonValueInSetConstraint" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "3f6f9c61", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "country:\n", + " test_name total_run failed\n", + " summary 
 most_common_value IN {'Germany', 'Italy'} 1 0\n" + ] + } + ], + "source": [ + "# check if the most common value in the column is in the set {\"Germany\", \"Italy\"}\n", + "most_common_value_in_set = columnMostCommonValueInSetConstraint(value_set={\"Germany\", \"Italy\"})\n", + "# bind the constraint to the column named \"country\"\n", + "summary_constraint = {\"country\": [most_common_value_in_set]}\n", + "# apply the summary constraints on the same profile for the customer_data data set\n", + "report = profile.apply_summary_constraints(summary_constraint)\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "id": "b67689b0", + "metadata": {}, + "source": [ + "### Column values not null" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "10453a81", + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import columnValuesNotNullConstraint" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e1cbc8a2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "customer:\n", + " test_name total_run failed\n", + " summary null_count EQ 0/None 1 0\n", + "spending:\n", + " test_name total_run failed\n", + " summary null_count EQ 0/None 1 1\n" + ] + } + ], + "source": [ + "# check if all values in the column are non-null\n", + "customer_value_not_null = columnValuesNotNullConstraint()\n", + "spending_value_not_null = columnValuesNotNullConstraint()\n", + "# bind the constraints to the columns; there are no null values in the customer column, but there is one in the spending column\n", + "summary_constraint = {\"customer\": [customer_value_not_null], \"spending\": [spending_value_not_null]}\n", + "# apply the summary constraints on the same profile for the customer_data data set\n", + "report = profile.apply_summary_constraints(summary_constraint)\n", + "\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "id": "851f72c4", + "metadata": {}, + "source": [ + "### Column value type equals or is in set constraint" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "30e2f233", + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import (\n", + " columnValuesTypeEqualsConstraint,\n", + " columnValuesTypeInSetConstraint\n", + ")\n", + "from whylogs.proto import InferredType" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f7a61d75", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "country:\n", + " test_name total_run failed\n", + " summary column_values_type EQ STRING 1 0\n", + "spending:\n", + " test_name total_run failed\n", + " summary column_values_type IN {'FRACTIONAL', 'INTEGRAL'} 1 0\n" + ] + } + ], + "source": [ + "# check if the values of the specified column are of type string\n", + "column_values_type_equals_string = columnValuesTypeEqualsConstraint(expected_type=InferredType.Type.STRING)\n", + "# check if the values of the specified column are either fractional or integral numbers\n", + "type_set = {InferredType.Type.FRACTIONAL, InferredType.Type.INTEGRAL}\n", + "column_value_types_in_set = columnValuesTypeInSetConstraint(type_set=type_set, verbose=True)\n", + "\n", + "column_type_summary_constraint = {\n", + " \"country\": [column_values_type_equals_string],\n", + " \"spending\": [column_value_types_in_set]\n", + "}\n", + "\n", + "#
 apply the summary constraints on the same profile for the customer_data data set\n", + "report = profile.apply_summary_constraints(column_type_summary_constraint)\n", + "# should not have failures since the country column type is string, and the spending column contains numbers\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "id": "aaaaa2dd", + "metadata": {}, + "source": [ + "# Column values in set" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "3c669679", + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import columnValuesInSetConstraint" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "e7acd883", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "grade:\n", + " test_name total_run failed\n", + " value IN {'A', 'B', 'E', 'C', 'F'} 5 1\n" + ] + } + ], + "source": [ + "student_grades = pd.DataFrame({\n", + " 'student_id': [1, 5, 15, 16, 22],\n", + " 'grade': ['C', 'C', 'A', '/', 'B']\n", + "})\n", + "\n", + "val_set = {'A', 'B', 'C', 'E', 'F'} # valid grades\n", + "column_values_in_set = columnValuesInSetConstraint(value_set=val_set)\n", + "\n", + "dc = DatasetConstraints(None, value_constraints={\n", + " \"grade\": [column_values_in_set], \n", + "})\n", + "\n", + "# the value constraints are applied at the time of logging the dataframe\n", + "profile = session.log_dataframe(student_grades, \"test.data\", constraints=dc)\n", + "\n", + "# out of the five students' grades we expect to see one failure, for the unknown grade '/'\n", + "# the total number of runs of the constraint should equal the number of values in the column\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "markdown", + "id": "6a9cb3aa", + "metadata": {}, + "source": [ + "# Regex matching constraints" + ] + }, + { + "cell_type": "markdown", + "id": "4661890e", + "metadata": {}, + "source": [ + "### String length value constraints using regex" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "ea7d2164", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "str1:\n", + " test_name total_run failed\n", + " value MATCH ^.{7}$ 7 5\n", + " value MATCH ^.{7,10}$ 7 2\n" + ] + } + ], + "source": [ + "from whylogs.core.statistics.constraints import stringLengthEqualConstraint, stringLengthBetweenConstraint\n", + "df = pd.DataFrame(\n", + " [\n", + " {\"str1\": \"length7\"},\n", + " {\"str1\": \"length_8\"},\n", + " {\"str1\": \"length__9\"},\n", + " {\"str1\": \"a 10\"},\n", + " {\"str1\": \"11 b\"},\n", + " {\"str1\": '(*&^%^&*(24!@_+>:|}?><\"\\\\'},\n", + " {\"str1\": \"1b34567\"},\n", + " ]\n", + ")\n", + "length_constraint7 = stringLengthEqualConstraint(length=7)\n", + "length_constraint7to10 = stringLengthBetweenConstraint(lower_value=7, upper_value=10)\n", + "length_constraints = [length_constraint7, length_constraint7to10]\n", + "dc = DatasetConstraints(None, value_constraints={\"str1\": length_constraints})\n", + "\n", + "profile = session.log_dataframe(df, 'test2.data', constraints=dc)\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "markdown", + "id": "f46a455e", + "metadata": {}, + "source": [ + "### Email matching constraint" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "b679674c", + "metadata": {}, + "outputs": [], + "source": [ + "from
whylogs.core.statistics.constraints import containsEmailConstraint" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "id": "041bd248", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "email:\n", + "\n", + "test_name:\tvalue MATCH ^(?i)(?:[a-z0-9!#$%&\\'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&\\'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)$\n", + "\n", + "total_run:\t8\n", + "\n", + "failed:\t\t4\n", + "\n" + ] + } + ], + "source": [ + "customer_emails = pd.DataFrame([\n", + " {\"email\": r\"abc's@gmail.com\"}, # valid\n", + " {\"email\": r'\"aVrrR Test \\@\"@gmail.com'}, # valid (if wrapped in quotes, emails can contain special characters)\n", + " {\"email\": r\"abc..q12@example.us\"}, # invalid (two consecutive dots)\n", + " {\"email\": r'\"sdsss\\d\"@gmail.com'}, # valid\n", + " {\"email\": r\"customer/department=shipping?@example-another.some-other.us\"}, # valid\n", + " {\"email\": r\".should_fail@yahoo.com\"}, # invalid (must not start wiht dot)\n", + " {\"email\": r\"some.@a.com\"}, # invalid (must not contain a dot directly before the @ symbol)\n", + " {\"email\": r\"abs@yahoo.\"}, # invalid (must not end with a dot)\n", + "])\n", + "\n", + "# use the predefined email regex from whylogs\n", + "default_contains_email_constraint = containsEmailConstraint()\n", + "\n", + "dc = DatasetConstraints(None, value_constraints={\"email\": [default_contains_email_constraint]})\n", + "\n", + "profile = session.log_dataframe(customer_emails, 'test.data', constraints=dc)\n", + "# we expect 4 of the 8 runs to be failures\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "id": "4d4f02e3", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: supplying your own regex pattern might cause slower evaluation of the containsEmailConstraint, depending on its complexity.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "email:\n", + " test_name total_run failed\n", + " value MATCH \\S+@\\S+ 8 1\n" + ] + } + ], + "source": [ + "# you can provide your own email regex and check the values against it\n", + "custom_contains_email_constraint = containsEmailConstraint(regex_pattern = r\"\\S+@\\S+\")\n", + "dc = DatasetConstraints(None, value_constraints={\"email\": [custom_contains_email_constraint]})\n", + "\n", + "profile = session.log_dataframe(customer_emails, 'test.data', constraints=dc)\n", + "# now we expect 1 of the 8 runs to be failures, the email that contains white spaces\n", + "format_report(dc.report())\n", + "# running the containsEmailConstraint with your own regex pattern may cause slow evaluation" + ] + }, + { + "cell_type": "markdown", + "id": "c959c36e", + "metadata": {}, + "source": [ + "### Credit Card matching constraint" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "id": "73901092", + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import containsCreditCardConstraint" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "id": "3f0d7e8f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "credit_card:\n", + "\n", + 
"test_name:\tvalue MATCH ^(?:(4[0-9]{3}([\\s-]?[0-9]{4}){2}[\\s-]?[0-9]{1,4})|(?:(5[1-5][0-9]{2}([\\s-]?[0-9]{4}){3}))|(?:(6(?:011|5[0-9]{2})([\\s-]?[0-9]{4}){3}))|(?:(3[47][0-9]{2}[\\s-]?[0-9]{6}[\\s-]?[0-9]{5}))|(?:(3(?:0[0-5]|[68][0-9])[0-9][\\s-]?[0-9]{6}[\\s-]?[0-9]{4}))|(?:2131|1800|35[0-9]{2,3}([\\s-]?[0-9]{4}){3}))$\n", + "\n", + "total_run:\t19\n", + "\n", + "failed:\t\t5\n", + "\n" + ] + } + ], + "source": [ + "credit_cards = pd.DataFrame(\n", + " [\n", + " {\"credit_card\": \"3714-496353-98431\"}, # amex\n", + " {\"credit_card\": \"3787 344936 71000\"}, # amex\n", + " {\"credit_card\": \"3056 930902 5904\"}, # diners club\n", + " {\"credit_card\": \"3065 133242 2899\"}, # invalid\n", + " {\"credit_card\": \"3852-000002-3237\"}, # diners club\n", + " {\"credit_card\": \"6011 1111 1111 1117\"}, # discover\n", + " {\"credit_card\": \"6011-0009-9013-9424\"}, # discover\n", + " {\"credit_card\": \"3530 1113 3330 0000\"}, # jcb\n", + " {\"credit_card\": \"3566-0020-2036-0505\"}, # jcb\n", + " {\"credit_card\": \"5555 5555 5555 4444\"}, # master card\n", + " {\"credit_card\": \"5105 1051 0510 5100\"}, # master card\n", + " {\"credit_card\": \"4111 1111 1111 1111\"}, # visa\n", + " {\"credit_card\": \"4012 8888 8888 1881\"}, # visa\n", + " {\"credit_card\": \"4222-2222-2222-2222\"}, # visa\n", + " {\"credit_card\": \"1111-1111-1111-1111\"}, # invalid\n", + " {\"credit_card\": \"a4111 1111 1111 1111b\"}, # invalid\n", + " {\"credit_card\": \"4111111111111111\"}, # visa\n", + " {\"credit_card\": 12345}, # invalid\n", + " {\"credit_card\": \"absfcvs\"}, # invalid\n", + " ]\n", + ")\n", + "\n", + "default_credit_card_constraint = containsCreditCardConstraint()\n", + "dc = DatasetConstraints(None, value_constraints={\"credit_card\": [default_credit_card_constraint]})\n", + "\n", + "profile = session.log_dataframe(credit_cards, 'test.data', constraints=dc)\n", + "# now we expect 5 of the 19 runs to be failures, the invalid credit cards\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "id": "7ce86172", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: supplying your own regex pattern might cause slower evaluation of the containsCreditCardConstraint, depending on its complexity.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "credit_card:\n", + " test_name total_run failed\n", + " value MATCH ^(?:[0-9]{4}[\\s-]?){3,4}$ 19 8\n" + ] + } + ], + "source": [ + "# you can provide your own credit card regex and check the values against it\n", + "custom_credit_card_constraint = containsCreditCardConstraint(regex_pattern = r\"^(?:[0-9]{4}[\\s-]?){3,4}$\")\n", + "dc = DatasetConstraints(None, value_constraints={\"credit_card\": [custom_credit_card_constraint]})\n", + "\n", + "profile = session.log_dataframe(credit_cards, 'test.data', constraints=dc)\n", + "# now more valid credit cards are being reported as failures\n", + "format_report(dc.report())\n", + "# running the containsCreditCardConstraint with your own regex pattern may cause slow evaluation" + ] + }, + { + "cell_type": "markdown", + "id": "15354468", + "metadata": {}, + "source": [ + "### SSN regex matching constraint" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "id": "6e475de8", + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import containsSSNConstraint" + ] + }, + { + "cell_type": "code", + 
"execution_count": 123, + "id": "9d956856", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "ssn:\n", + "\n", + "test_name:\tvalue MATCH ^(?!000|666|9[0-9]{2})[0-9]{3}[\\s-]?(?!00)[0-9]{2}[\\s-]?(?!0000)[0-9]{4}$\n", + "\n", + "total_run:\t8\n", + "\n", + "failed:\t\t4\n", + "\n" + ] + } + ], + "source": [ + "ssn_data = pd.DataFrame([\n", + " {\"ssn\": \"123-01-2335\"}, # valid\n", + " {\"ssn\": \"039780012\"}, # valid\n", + " {\"ssn\": \"000231324\"}, # invalid\n", + " {\"ssn\": \"666781132\"}, # invalid\n", + " {\"ssn\": \"926-89-1234\"}, # invalid\n", + " {\"ssn\": \"001-01-0001\"}, # valid\n", + " {\"ssn\": \"122 23 0001\"}, # valid\n", + " {\"ssn\": \"1234-12-123\"}, # invalid\n", + "])\n", + "\n", + "default_ssn_constraint = containsSSNConstraint()\n", + "\n", + "dc = DatasetConstraints(None, value_constraints={\"ssn\": [default_ssn_constraint]})\n", + "\n", + "profile = session.log_dataframe(ssn_data, 'test.data', constraints=dc)\n", + "# now we expect 4 of the 8 runs to be failures, the invalid ssn numbers\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "id": "d7cf8fe8", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: supplying your own regex pattern might cause slower evaluation of the containsSSNConstraint, depending on its complexity.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "ssn:\n", + " test_name total_run failed\n", + " value MATCH ^[0-9]{3}-[0-9]{2}-[0-9]{4}$ 8 5\n" + ] + } + ], + "source": [ + "# you can provide your own ssn regex and check the values against it\n", + "custom_ssn_constraint = containsSSNConstraint(regex_pattern = r\"^[0-9]{3}-[0-9]{2}-[0-9]{4}$\")\n", + "dc = DatasetConstraints(None, value_constraints={\"ssn\": [custom_ssn_constraint]})\n", + "\n", + "profile = session.log_dataframe(ssn_data, 'test.data', constraints=dc)\n", + "# now more valid ssn numbers are being reported as failures\n", + "format_report(dc.report())\n", + "# running the containsSSNConstraint with your own regex pattern may cause slow evaluation" + ] + }, + { + "cell_type": "markdown", + "id": "514241b0", + "metadata": {}, + "source": [ + "### URL regex matching constraint" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "id": "9b6b8257", + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import containsURLConstraint" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "id": "52460643", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "url:\n", + "\n", + "test_name:\tvalue MATCH ^(?:http(s)?:\\/\\/)?((www)|(?:[a-zA-z0-9-]+)\\.)(?:[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.(?:[a-zA-Z0-9]{1,6})\\b(?:[-a-zA-Z0-9@:%_\\+.~#?&//=]*))$\n", + "\n", + "total_run:\t10\n", + "\n", + "failed:\t\t4\n", + "\n" + ] + } + ], + "source": [ + "web_urls = pd.DataFrame([\n", + " {\"url\": \"http://www.example.com\"}, # valid\n", + " {\"url\": \"abc.test.com\"}, # valid (without protocol)\n", + " {\"url\": \"abc.w23w.asb#abc?a=2\"}, # valid (without protocol)\n", + " {\"url\": \"https://ab.abc.bc\"}, # valid\n", + " {\"url\": \"a.b.c\"}, # valid\n", + " {\"url\": \"abcd\"}, # invalid\n", + " {\"url\": \"123.w23.235\"}, # valid\n", + " {\"url\": \"asf://saf.we.12\"}, # invalid\n", + " 
{\"url\": \"12345\"}, # invalid\n", + " {\"url\": \"1.2\"}, # invalid\n", + " \n", + "])\n", + "\n", + "default_url_constraint = containsURLConstraint()\n", + "dc = DatasetConstraints(None, value_constraints={\"url\": [default_url_constraint]})\n", + "\n", + "profile = session.log_dataframe(web_urls, 'test.data', constraints=dc)\n", + "# now we expect the 4 invalid urls, out of the 10 in total, to be reported as failures\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "id": "d1bfc094", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: supplying your own regex pattern might cause slower evaluation of the containsURLConstraint, depending on its complexity.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "url:\n", + " test_name total_run failed\n", + " value MATCH ^http(s)?:\\/\\/(www\\.)?.+\\..+$ 10 8\n" + ] + } + ], + "source": [ + "# you can provide your own ur; regex and check the values against it\n", + "custom_url_constraint = containsURLConstraint(regex_pattern = r\"^http(s)?:\\/\\/(www\\.)?.+\\..+$\")\n", + "dc = DatasetConstraints(None, value_constraints={\"url\": [custom_url_constraint]})\n", + "\n", + "profile = session.log_dataframe(web_urls, 'test.data', constraints=dc)\n", + "# with the new regex more valid urls are being reported as failures\n", + "format_report(dc.report())\n", + "# running the containsURLConstraint with your own regex pattern may cause slow evaluation" + ] + }, + { + "cell_type": "markdown", + "id": "eff98762", + "metadata": {}, + "source": [ + "# Datetime/json constraints" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a9fdc2ef", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "str1:\n", + " test_name total_run failed\n", + " value APPLY_FUNC _try_parse_dateutil 14 9\n", + " value APPLY_FUNC _try_parse_json 14 12\n", + " value APPLY_FUNC _matches_json_schema 14 12\n", + " value APPLY_FUNC _try_parse_strftime_format 14 12\n" + ] + } + ], + "source": [ + "from whylogs.core.statistics.constraints import (\n", + " dateUtilParseableConstraint, jsonParseableConstraint, matchesJsonSchemaConstraint, strftimeFormatConstraint )\n", + "df = pd.DataFrame(\n", + " [\n", + " {\"str1\": \"1990-12-1\"}, # dateutil valid; strftime valid\n", + " {\"str1\": \"1990/12/1\"},\n", + " {\"str1\": \"today is 2019-03-27\"}, # dateutil invalid\n", + " {\"str1\": \"Monday at 12:01am\"},\n", + " {\"str1\": \"xyz_not_a_date\"}, # dateutil invalid\n", + " {\"str1\": \"yesterday\"}, # dateutil invalid\n", + " {\"str1\": {\"name\": \"s\", \"w2w2\": \"dgsg\", \"years\": 232, \"abc\": 1}}, # schema valid\n", + " {\"str1\": {\"name\": \"s\", \"w2w2\": \"dgsg\", \"years\": 232}}, # schema invalid\n", + " {\"str1\": json.dumps({\"name\": \"s\", \"w2w2\": \"dgsg\", \"years\": 232, \"abc\": 1})}, # json valid, schema valid\n", + " {\"str1\": json.dumps({\"name\": \"s\", \"w2w2\": \"dgsg\", \"years\": \"232\", \"abc\": 1})}, # json valid\n", + " {\"str1\": \"random str : fail everything\"},\n", + " {\"str1\": \"2003-12-23\"}, # strftime valid, dateutil valid\n", + " {\"str1\": \"2003-15-23\"}, # strftime invalid, dateutil invalid\n", + " {\"str1\": \"10-12-32\"}, # strftime invalid, dateutil valid\n", + " ]\n", + " )\n", + "\n", + "dateutil_parseable = dateUtilParseableConstraint()\n", + "json_parseable = 
 jsonParseableConstraint()\n", + "\n", + "json_schema = {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"name\": {\"type\": \"string\"},\n", + " \"years\": {\"type\": \"integer\"},\n", + " },\n", + " \"required\": [\"name\", \"abc\"],\n", + " }\n", + "matches_json_schema = matchesJsonSchemaConstraint(json_schema=json_schema)\n", + "\n", + "is_strftime = strftimeFormatConstraint(format=\"%Y-%m-%d\")\n", + "\n", + "apply_func_constraints = [dateutil_parseable, json_parseable, matches_json_schema, is_strftime]\n", + "\n", + "\n", + "dc = DatasetConstraints(None, value_constraints={\"str1\": apply_func_constraints})\n", + "profile = session.log_dataframe(df, 'test3.data', constraints=dc)\n", + "\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "markdown", + "id": "16af2279", + "metadata": {}, + "source": [ + "From the comments in the cell above, we can see which values pass or fail each constraint. The dateutil constraint has 5 passing values in the dataset, and each of the other 3 constraints has only 2 passing values out of 14 in total." + ] + }, + { + "cell_type": "markdown", + "id": "dd89c8d7", + "metadata": {}, + "source": [ + "# Entropy and Distributional Measures" + ] + }, + { + "cell_type": "markdown", + "id": "b31901ff", + "metadata": {}, + "source": [ + "### Entropy" + ] + }, + { + "cell_type": "markdown", + "id": "426903d3", + "metadata": {}, + "source": [ + "Check if the column entropy is in some interval [a, b]. Works for both discrete and continuous valued columns." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a879dea4", + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import approximateEntropyBetweenConstraint" + ] + }, + { + "cell_type": "markdown", + "id": "0d102965", + "metadata": {}, + "source": [ + "#### Entropy on categorical data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "daa642d5", + "metadata": {}, + "outputs": [], + "source": [ + "pets = np.random.choice(['cat', 'dog', 'rabbit', 'hamster'], size=50, replace=True, p=[0.3, 0.1, 0.2, 0.4])\n", + "pet_df = pd.DataFrame({\n", + " \"pet\": pets\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c4cb1421", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "pet:\n", + " test_name total_run failed\n", + " summary entropy BTWN 0.7 and 2.1 1 0\n" + ] + } + ], + "source": [ + "# check if the entropy of the pet_df 'pet' column is between 0.7 and 2.1 (the actual value is around 1.85)\n", + "entropy_between_values_constraint = approximateEntropyBetweenConstraint(lower_value=0.7, upper_value=2.1)\n", + "\n", + "dc = DatasetConstraints(None, summary_constraints={\"pet\": [entropy_between_values_constraint]})\n", + "\n", + "profile = session.log_dataframe(pet_df, 'test.data', constraints=dc)\n", + "# now we expect the constraint to complete without failures\n", + "report = profile.apply_summary_constraints()\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "id": "5b8171ca", + "metadata": {}, + "source": [ + "#### Entropy on continuous data" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "55e59691", + "metadata": {}, + "outputs": [], + "source": [ + "# sample 100 data points from a normal distribution with mean 30000 and standard deviation 15000 to represent sales values\n", + "sales = np.random.normal(loc=30000, scale=15000,
 size=100)\n", + "\n", + "sales_df = pd.DataFrame({\n", + " \"sales\": sales\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "931a2585", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "sales:\n", + " test_name total_run failed\n", + " summary entropy BTWN 2.3 and 3.5 1 1\n" + ] + } + ], + "source": [ + "# check if the entropy of the sales_df 'sales' column is between 2.3 and 3.5 (the actual value is around 3.8)\n", + "entropy_between_values_constraint_cont = approximateEntropyBetweenConstraint(lower_value=2.3, upper_value=3.5)\n", + "\n", + "dc = DatasetConstraints(None, summary_constraints={\"sales\": [entropy_between_values_constraint_cont]})\n", + "\n", + "profile = session.log_dataframe(sales_df, 'test.data', constraints=dc)\n", + "# now we expect the constraint to fail since the entropy is between 3.8 and 3.9\n", + "report = profile.apply_summary_constraints()\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "id": "020e9ae2", + "metadata": {}, + "source": [ + "### KS Test" + ] + }, + { + "cell_type": "markdown", + "id": "c3b4dcd2", + "metadata": {}, + "source": [ + "The KS Test can only be executed on continuous data." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "c53fdd26", + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import parametrizedKSTestPValueGreaterThanConstraint" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "48251fb0", + "metadata": {}, + "outputs": [], + "source": [ + "# this would be the reference distribution, sales 2020\n", + "sales_2020 = np.random.normal(loc=30000, scale=15000, size=100)\n", + "# this would be the target distribution, sales 2021\n", + "sales_2021 = np.random.normal(loc=45000, scale=10000, size=100)\n", + "# we want to check if the sales in 2020 have the same distribution as the sales in 2021" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "2d265980", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "sales:\n", + " test_name total_run failed\n", + " summary ks_test p-value GT 0.05 1 1\n" + ] + } + ], + "source": [ + "sales_2021_df = pd.DataFrame({\n", + " \"sales\": sales_2021\n", + "})\n", + "\n", + "# check if the p-value of the KS test for reference distribution sales_2020 is greater than 0.05 \n", + "# if so, we do not reject the null hypothesis\n", + "ks_test_p_value_greater_than = parametrizedKSTestPValueGreaterThanConstraint(reference_distribution=sales_2020, p_value=0.05)\n", + "\n", + "dc = DatasetConstraints(None, summary_constraints={\"sales\": [ks_test_p_value_greater_than]})\n", + "\n", + "profile = session.log_dataframe(sales_2021_df, 'test.data', constraints=dc)\n", + "# now we expect the constraint to fail since the two sales distributions differ\n", + "report = profile.apply_summary_constraints()\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "id": "1958febe", + "metadata": {}, + "source": [ + "The p-value is less than 0.05, which means we can reject the null hypothesis with this confidence level."
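, + "\n", + "As a cross-check (a minimal sketch, not part of the whylogs API), the same comparison can be run directly with scipy's two-sample KS test; this assumes scipy is installed and that the sales_2020 and sales_2021 arrays from the cells above are still in scope." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3fe1a2b4", + "metadata": {}, + "outputs": [], + "source": [ + "# a minimal sketch, assuming scipy is available: run the two-sample KS test directly\n", + "from scipy import stats\n", + "\n", + "# compare the 2021 sales (target) against the 2020 sales (reference)\n", + "ks_statistic, p_value = stats.ks_2samp(sales_2020, sales_2021)\n", + "print(f\"KS statistic: {ks_statistic:.4f}, p-value: {p_value:.4g}\")\n", + "# a p-value below 0.05 agrees with the constraint failure reported above\n" + ] + }, + { + "cell_type": "markdown", + "id": "4fe2b3c5", + "metadata": {}, + "source": [ + "The constraint encapsulates the same decision, so the sketch above is only an illustration of what it checks."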
+ ] + }, + { + "cell_type": "markdown", + "id": "9ff32245", + "metadata": {}, + "source": [ + "### KL Divergence" + ] + }, + { + "cell_type": "markdown", + "id": "a3d9344e", + "metadata": {}, + "source": [ + "The KL Divergence constraint is supported for both discrete and continuous variables." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c729bb38", + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import columnKLDivergenceLessThanConstraint" + ] + }, + { + "cell_type": "markdown", + "id": "2300513e", + "metadata": {}, + "source": [ + "#### KL Divergence for continuous case" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "2d865e39", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "sales:\n", + " test_name total_run failed\n", + " summary kl_divergence threshold LT 0.6 1 1\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/milena/Documents/GitHub/whylogs/src/whylogs/core/summaryconverters.py:184: RuntimeWarning: divide by zero encountered in true_divide\n", + " kl_divergence = np.sum(np.where(pmf_target != 0, pmf_target * np.log(pmf_target / pmf_reference), 0))\n", + "/home/milena/Documents/GitHub/whylogs/src/whylogs/core/summaryconverters.py:184: RuntimeWarning: invalid value encountered in true_divide\n", + " kl_divergence = np.sum(np.where(pmf_target != 0, pmf_target * np.log(pmf_target / pmf_reference), 0))\n", + "/home/milena/Documents/GitHub/whylogs/src/whylogs/core/summaryconverters.py:184: RuntimeWarning: divide by zero encountered in log\n", + " kl_divergence = np.sum(np.where(pmf_target != 0, pmf_target * np.log(pmf_target / pmf_reference), 0))\n", + "/home/milena/Documents/GitHub/whylogs/src/whylogs/core/summaryconverters.py:184: RuntimeWarning: invalid value encountered in multiply\n", + " kl_divergence = np.sum(np.where(pmf_target != 0, pmf_target * np.log(pmf_target / pmf_reference), 0))\n" + ] + } + ], + "source": [ + "# check if the kl divergence is greater than 0.6 \n", + "kl_divergence_greater_than = columnKLDivergenceLessThanConstraint(reference_distribution=sales_2020, threshold=0.6)\n", + "\n", + "dc = DatasetConstraints(None, summary_constraints={\"sales\": [kl_divergence_greater_than]})\n", + "\n", + "profile = session.log_dataframe(sales_2021_df, 'test.data', constraints=dc)\n", + "# now we expect the constraint to fail\n", + "report = profile.apply_summary_constraints()\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "id": "55ba531d", + "metadata": {}, + "source": [ + "The distribution of sales in 2020 cannot be encoded with the distribution of sales in 2021." 
+ ] + }, + { + "cell_type": "markdown", + "id": "4ec0d8f1", + "metadata": {}, + "source": [ + "#### KL Divergence for discrete case" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "5e22e532", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "pet:\n", + " test_name total_run failed\n", + " summary kl_divergence threshold LT 0.6 1 0\n" + ] + } + ], + "source": [ + "# create a new distribtution from the pets sample with different probabilities\n", + "pets_reference = np.random.choice(['cat', 'dog', 'rabbit', 'hamster'], size=50, replace=True, p=[0.5, 0.1, 0.2, 0.2])\n", + "\n", + "# check if the kl divergence is greater than 0.6 \n", + "kl_divergence_greater_than = columnKLDivergenceLessThanConstraint(reference_distribution=pets_reference, threshold=0.6)\n", + "\n", + "dc = DatasetConstraints(None, summary_constraints={\"pet\": [kl_divergence_greater_than]})\n", + "\n", + "profile = session.log_dataframe(pet_df, 'test.data', constraints=dc)\n", + "# now we expect the constraint to not fail\n", + "report = profile.apply_summary_constraints()\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "id": "ba3abc84", + "metadata": {}, + "source": [ + "### Chi-Squared Test" + ] + }, + { + "cell_type": "markdown", + "id": "ef4c2790", + "metadata": {}, + "source": [ + "The Chi-Squared test constraint is only supported for categorical values." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "efa2de2a", + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import columnChiSquaredTestPValueGreaterThanConstraint" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "cbde8291", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "pet:\n", + " test_name total_run failed\n", + " summary chi_squared_test p-value GT 0.05 1 1\n" + ] + } + ], + "source": [ + "# create a new distribtution from the pets sample with different probabilities\n", + "pets_reference = np.random.choice(['cat', 'dog', 'rabbit', 'hamster'], size=50, replace=True, p=[0.01, 0.01, 0.97, 0.01])\n", + "\n", + "# check if the p-value is greater than 0.05\n", + "chi_squared_p_value_greater_than = columnChiSquaredTestPValueGreaterThanConstraint(reference_distribution=pets_reference, p_value=0.05)\n", + "\n", + "dc = DatasetConstraints(None, summary_constraints={\"pet\": [chi_squared_p_value_greater_than]})\n", + "\n", + "profile = session.log_dataframe(pet_df, 'test.data', constraints=dc)\n", + "# now we expect the constraint to fail since the distributions are different\n", + "report = profile.apply_summary_constraints()\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "id": "14fa9046", + "metadata": {}, + "source": [ + "The p-value is not greater than 0.05, which means that we can reject the null hypothesis that the distributions are equal within this confidence interval." + ] + }, + { + "cell_type": "markdown", + "id": "0b589597", + "metadata": {}, + "source": [ + "If you don't have a reference distribution for calculating the Chi-Squared test, but you know the approximate frequencies of each of the items, you can use this constraint by supplying a mapping of items and frequencies as counts, in the reference distribution parameter of the constraint." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "b4f105f1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "pet:\n", + " test_name total_run failed\n", + " summary chi_squared_test p-value GT 0.05 1 1\n" + ] + } + ], + "source": [ + "# create a new distribtution from the pets sample with different probabilities\n", + "reference_dict_pets = {\n", + " 'cat': 1,\n", + " 'dog': 1,\n", + " 'rabbit': 48, \n", + " 'hamster': 1,\n", + "}\n", + "\n", + "# check if the p_value is greater than 0.05\n", + "chi_squared_p_value_greater_than = columnChiSquaredTestPValueGreaterThanConstraint(reference_distribution=reference_dict_pets, p_value=0.05)\n", + "\n", + "dc = DatasetConstraints(None, summary_constraints={\"pet\": [chi_squared_p_value_greater_than]})\n", + "\n", + "profile = session.log_dataframe(pet_df, 'test.data', constraints=dc)\n", + "# now we expect the constraint to fail since this is approximately the same distribution from the previous example\n", + "report = profile.apply_summary_constraints()\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "id": "78607967", + "metadata": {}, + "source": [ + "The p-value is not greater than 0.05, which means that we can reject the null hypothesis that the distributions are equal within this confidence interval." + ] + }, + { + "cell_type": "markdown", + "id": "a4113311", + "metadata": {}, + "source": [ + "## Table shape constraints" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "cceadc2d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Table shape constraint failures -\n", + " test_name total_run failed\n", + " table total_row_number EQ 15 1 1\n", + " table total_row_number EQ 14 1 0\n", + " table columns CONTAIN this_column_does_not_exist 1 1\n", + " table columns CONTAIN col2 1 0\n", + " table columns EQ {'is', 'a', 'this', 'columns', 'set', 'wrong'} 1 1\n", + " table columns EQ {'str1', 'col2'} 1 0\n" + ] + } + ], + "source": [ + "from whylogs.core.statistics.constraints import (\n", + " numberOfRowsConstraint, columnExistsConstraint, columnsMatchSetConstraint )\n", + "\n", + "df = pd.DataFrame(\n", + " [\n", + " {\"str1\": \"random1\"},\n", + " {\"str1\": \"random2\"},\n", + " {\"str1\": \"random 4-1\"},\n", + " {\"str1\": \"4 random\"},\n", + " {\"str1\": \"whylogs rocks!\"},\n", + " {\"str1\": \" \"},\n", + " {\"str1\": 12},\n", + " {\"str1\": {\"name\": \"s\", \"w2w2\": \"dgsg\", \"years\": 232}},\n", + " {\"str1\": json.dumps({\"name\": \"s\", \"w2w2\": \"dgsg\", \"years\": 232, \"abc\": 1})},\n", + " {\"str1\": json.dumps({\"name\": \"s\", \"w2w2\": \"dgsg\", \"years\": \"232\", \"abc\": 1})},\n", + " {\"str1\": \"random str : fail everything\"},\n", + " {\"str1\": \"2003-12-23\"},\n", + " {\"str1\": \"2003-15-23\"},\n", + " {\"str1\": \"10-12-32\"},\n", + " ]\n", + " )\n", + "\n", + "df['col2'] = range(len(df))\n", + "\n", + "rows = numberOfRowsConstraint(n_rows=len(df)+1) # fail\n", + "rows_2 = numberOfRowsConstraint(n_rows=len(df)) # pass\n", + "\n", + "column_exist = columnExistsConstraint(\"this_column_does_not_exist\") # fail\n", + "column_exist2 = columnExistsConstraint(\"col2\") # pass\n", + "\n", + "set1 = {'this', 'is', 'a', 'wrong', 'columns', 'set'}\n", + "columns_set = set(df.columns)\n", + "columns_match = columnsMatchSetConstraint(set1) # fail\n", + "columns_match2 = columnsMatchSetConstraint(columns_set) # 
pass\n", + "\n", + "table_shape_constraints = [rows, rows_2, column_exist, column_exist2, columns_match, columns_match2]\n", + "\n", + "dc = DatasetConstraints(None, table_shape_constraints=table_shape_constraints)\n", + "\n", + "profile = session.log_dataframe(df, \"test.data\", constraints=dc)\n", + "\n", + "report = profile.apply_table_shape_constraints()\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "id": "1c44652a", + "metadata": {}, + "source": [ + "### Table shape example 2" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "339bdec3", + "metadata": {}, + "outputs": [], + "source": [ + "logger = session.logger(dataset_name=\"test2.data\", constraints=dc)\n", + "logger.log_dataframe(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "56629731", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Table shape constraint failures -\n", + " test_name total_run failed\n", + " table total_row_number EQ 15 2 2\n", + " table total_row_number EQ 14 2 0\n", + " table columns CONTAIN this_column_does_not_exist 2 2\n", + " table columns CONTAIN col2 2 0\n", + " table columns EQ {'is', 'a', 'this', 'columns', 'set', 'wrong'} 2 2\n", + " table columns EQ {'str1', 'col2'} 2 0\n" + ] + } + ], + "source": [ + "report = logger.profile.apply_table_shape_constraints()\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "id": "39f56f39", + "metadata": {}, + "source": [ + "Logging another dataframe with different DatasetProfile but the same DatasetConstraints, just an example" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "6f510f16", + "metadata": {}, + "outputs": [], + "source": [ + "logger.log({\"this_column_does_not_exist\": 1}) # logging a new non existent column" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "63280b15", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Table shape constraint failures -\n", + " test_name total_run failed\n", + " table total_row_number EQ 15 3 3\n", + " table total_row_number EQ 14 3 0\n", + " table columns CONTAIN this_column_does_not_exist 3 2\n", + " table columns CONTAIN col2 3 0\n", + " table columns EQ {'is', 'a', 'this', 'columns', 'set', 'wrong'} 3 3\n", + " table columns EQ {'str1', 'col2'} 3 1\n" + ] + } + ], + "source": [ + "report2 = logger.profile.apply_table_shape_constraints()\n", + "format_report(report2)" + ] + }, + { + "cell_type": "markdown", + "id": "78904486", + "metadata": {}, + "source": [ + "After logging the column 'this_column_does_not_exist', the total row number stays the same, \n", + "so the numberOfRowsConstraint passed.\n", + "\n", + "**'table columns CONTAIN this_column_does_not_exist'** constraint now passed, since the column now exists, but\n", + "\n", + "**'table columns EQ {'str1', 'col2'}'** now failed, because new column was logged\n" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "e2c41e44", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Table shape constraint failures -\n", + " test_name total_run failed\n", + " table columns EQ {'str1', 'col2', 'this_column_does_not_exist'} 1 0\n" + ] + } + ], + "source": [ + "set2 = set(columns_set)\n", + "set2.add(\"this_column_does_not_exist\")\n", + "\n", + "columns_match3 = columnsMatchSetConstraint(set2) # new constraint containing the new column\n", + "\n", + 
"report3 = logger.profile.apply_table_shape_constraints(SummaryConstraints([columns_match3])) # applying just the new constraint\n", + "format_report(report3)" + ] + }, + { + "cell_type": "markdown", + "id": "e45f346c", + "metadata": {}, + "source": [ + "After adding the new column to **'set2'** and creating a **columnsMatchSetConstraint** with it, now it doesn't fail" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "688f2478", + "metadata": {}, + "outputs": [], + "source": [ + "log_dict = dict()\n", + " # logging a new value for every column (one more row)\n", + "for column in df.columns:\n", + " value = df[column][10] # sample from the column\n", + " log_dict[column] = value\n", + "logger.log(log_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "668c93c6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Table shape constraint failures -\n", + " test_name total_run failed\n", + " table total_row_number EQ 15 4 3\n", + " table total_row_number EQ 14 4 1\n", + " table columns CONTAIN this_column_does_not_exist 4 2\n", + " table columns CONTAIN col2 4 0\n", + " table columns EQ {'is', 'a', 'this', 'columns', 'set', 'wrong'} 4 4\n", + " table columns EQ {'str1', 'col2'} 4 2\n" + ] + } + ], + "source": [ + "report4 = logger.profile.apply_table_shape_constraints()\n", + "format_report(report4)" + ] + }, + { + "cell_type": "markdown", + "id": "f3f3f878", + "metadata": {}, + "source": [ + "**'table total_row_number EQ 14'** now failed since new row was logged" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "a0ba26b5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Table shape constraint failures -\n", + " test_name total_run failed\n", + " table total_row_number EQ 15 1 0\n" + ] + } + ], + "source": [ + "rows_3 = numberOfRowsConstraint(n_rows=len(df.index) + 1) # new numberOfRowsConstraint\n", + "report5 = logger.profile.apply_table_shape_constraints(SummaryConstraints([rows_3]))\n", + "format_report(report5)" + ] + }, + { + "cell_type": "markdown", + "id": "b48a12eb", + "metadata": {}, + "source": [ + "Creating a new **numberOfRowsConstraint** with n_rows = previous_n_rows + 1 and applying it, now passed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "e6058796", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "15\n" + ] + } + ], + "source": [ + "profile = logger.close() # closing the logger and getting the DatasetProfile\n", + "print(profile.total_row_number)" + ] + }, + { + "cell_type": "markdown", + "id": "1956cf2b", + "metadata": {}, + "source": [ + "## Multi column constraints\n", + "### Logical operations between values of the specified columns" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "0ae20c1d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Multi column constraint failures -\n", + " test_name total_run failed\n", + " multi column value col1 GT col2 4 2\n", + " multi column value col1 EQ col2 4 3\n" + ] + } + ], + "source": [ + "from whylogs.core.statistics.constraints import columnValuesAGreaterThanBConstraint, columnValuesAEqualBConstraint\n", + "\n", + "df = pd.DataFrame({\"col1\": [4, 5, 6, 7], \"col2\": [0, 1, 6, 15]})\n", + "\n", + "a_gt_b = columnValuesAGreaterThanBConstraint(column_A=\"col1\", column_B=\"col2\")\n", + "a_eq_b = columnValuesAEqualBConstraint(column_A=\"col1\", column_B=\"col2\")\n", + "\n", + "dc = DatasetConstraints(None, multi_column_value_constraints=[a_gt_b, a_eq_b])\n", + "\n", + "profile = session.log_dataframe(df, \"test4.data\", constraints=dc)\n", + "\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "markdown", + "id": "76062557", + "metadata": {}, + "source": [ + "Value-by-value comparison: for col1 > col2 only 2 of the 4 rows pass, and for col1 == col2 only 1 passes (the third elements of the two columns are equal)." + ] + }, + { + "cell_type": "markdown", + "id": "f744614c", + "metadata": {}, + "source": [ + "### Sum of row values of multiple columns equals some value, or some column value" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ae986496", + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import sumOfRowValuesOfMultipleColumnsEqualsConstraint" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "b5a8a4a2", + "metadata": {}, + "outputs": [], + "source": [ + "total_expenses = pd.DataFrame({\n", + " \"employees %\": [25, 45, 15, 3],\n", + " \"equipment %\": [10, 12, 4, 9],\n", + " \"materials %\": [40, 35, 45, 55],\n", + " \"other %\": [25, 8, 4, 6]\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "62430c24", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Multi column constraint failures -\n", + "\n", + "test_name:\tmulti column value SUM ['employees %', 'equipment %', 'materials %', 'other %'] EQ 100\n", + "\n", + "total_run:\t4\n", + "\n", + "failed:\t\t2\n", + "\n" + ] + } + ], + "source": [ + "# check if the expense percentages in each row sum to 100 %\n", + "sum_of_row_values_eq_100 = sumOfRowValuesOfMultipleColumnsEqualsConstraint(\n", + " columns=[\"employees %\", \"equipment %\", \"materials %\", \"other %\"],\n", + " value=100\n", + ")\n", + "\n", + "dc = DatasetConstraints(None, multi_column_value_constraints=[sum_of_row_values_eq_100])\n", + "\n", + "# the multi column value constraints do not need to be applied to the data explicitly;\n", + "# they are applied at the time of logging\n", + "profile = session.log_dataframe(total_expenses, \"test.data\", constraints=dc)\n", 
+ "# we expect 2 of the 4 rows to be failures since the last two rows do not sum to 100\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "9d1b7812", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Multi column constraint failures -\n", + " test_name total_run failed\n", + " multi column value SUM ['equipment %', 'materials %'] EQ ['other %'] 4 4\n" + ] + } + ], + "source": [ + "# check if the sum of the row values (percentages) for 'equipment %' and 'materials %' equalt the value of 'other %'\n", + "sum_of_row_values_eq_100 = sumOfRowValuesOfMultipleColumnsEqualsConstraint(\n", + " columns=[\"equipment %\", \"materials %\"],\n", + " value='other %'\n", + ")\n", + "\n", + "dc = DatasetConstraints(None, multi_column_value_constraints=[sum_of_row_values_eq_100])\n", + "profile = session.log_dataframe(total_expences, \"test.data\", constraints=dc)\n", + "\n", + "# we expect all rows to be failures since the sum of 'equipment %' and 'materials %' is not equal to the value of the column 'other %'\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "markdown", + "id": "bbbc748f", + "metadata": {}, + "source": [ + "### Column Pair Values in Set" + ] + }, + { + "cell_type": "markdown", + "id": "fa997a38", + "metadata": {}, + "source": [ + "Check if the values of a pair of columns are in a predefined set of pair values." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "c7cbd1e0", + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import columnPairValuesInSetConstraint" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "9c679454", + "metadata": {}, + "outputs": [], + "source": [ + "product_grades = pd.DataFrame({\n", + " \"product\": [\"ProductA\", \"ProductB\", \"ProductC\", \"ProductD\", \"ProductE\"],\n", + " \"grade\": [\"A\", \"A\", \"B\", \"C\", \"C\"],\n", + " \"subgrade\": [\"A1\", \"A3\", \"B2\", \"C2\", \"C2\"]\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "b6db3720", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Multi column constraint failures -\n", + "\n", + "test_name:\tmulti column value ['grade', 'subgrade'] IN {('C', 'C2'), ('B', 'B2'), ('B', 'B1'), ('A', 'A1'), ('A', 'A2'), ('C', 'C1')}\n", + "\n", + "total_run:\t5\n", + "\n", + "failed:\t\t1\n", + "\n" + ] + } + ], + "source": [ + "# we want to check if each of the grade and subgrade pairs are in the specific set\n", + "grade_subgrade_pairs_in_set = columnPairValuesInSetConstraint(\n", + " column_A=\"grade\", \n", + " column_B=\"subgrade\",\n", + " value_set = {(\"A\", \"A1\"), (\"A\", \"A2\"), (\"B\", \"B1\"), (\"B\", \"B2\"), (\"C\", \"C1\"), (\"C\", \"C2\")}\n", + ")\n", + "\n", + "dc = DatasetConstraints(None, multi_column_value_constraints=[grade_subgrade_pairs_in_set])\n", + "profile = session.log_dataframe(product_grades, \"test.data\", constraints=dc)\n", + "\n", + "# we expect 1 out of 5 pairs to be a failure, specifically (\"A\", \"A3\")\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "markdown", + "id": "72a21ad0", + "metadata": {}, + "source": [ + "### Column Values Unique within Row" + ] + }, + { + "cell_type": "markdown", + "id": "cf52c403", + "metadata": {}, + "source": [ + "Check if the value of the specified column is unique within each row." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "42ef3e02", + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import columnValuesUniqueWithinRow" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "db57da2c", + "metadata": {}, + "outputs": [], + "source": [ + "users = pd.DataFrame({\n", + " \"first_name\": [\"John\", \"Jane\", \"Bob\", \"Anna\"],\n", + " \"last_name\": [\"Doe\", \"Doe\", \"Smith\", \"Jones\"],\n", + " \"username\": [\"jd123\", \"jane.doe@example.com\", \"bobsmith\", \"_anna_\"],\n", + " \"email\": [\"john.doe@example.com\", \"jane.doe@example.com\", \"bob.smith@example.com\", \"anna_jones@example.com\"],\n", + " \"followers\": [1525, 12268, 51343, 867],\n", + " \"points\": [23.4, 123.2, 432.22, 32.1],\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "d78fc0a4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Multi column constraint failures -\n", + " test_name total_run failed\n", + " multi column value email NOT_IN all 4 1\n" + ] + } + ], + "source": [ + "# check if the emails are unique compared to other fields for each user\n", + "# suppose we do not want to accept a username which is the same as the user's email\n", + "email_values_unique_within_row = columnValuesUniqueWithinRow(column_A=\"email\")\n", + "\n", + "dc = DatasetConstraints(None, multi_column_value_constraints=[email_values_unique_within_row])\n", + "profile = session.log_dataframe(users, \"test.data\", constraints=dc)\n", + "\n", + "# we expect 1 out of 4 evaluations of the constraint to be a failure, since Jane Doe's email is the same as their username\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "markdown", + "id": "a7da9d24", + "metadata": {}, + "source": [ + "# Generate default constraints for a dataset" + ] + }, + { + "cell_type": "markdown", + "id": "a88c6ede", + "metadata": {}, + "source": [ + "Let's log the users data frame from the previous example, without any constraints. We will use whylogs' generate_constraints method to generate default constraints using the dataset profile."
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "b938730f", + "metadata": {}, + "outputs": [], + "source": [ + "profile = session.log_dataframe(users, \"test.data\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a552cdce", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"properties\": {\n", + " \"schemaMajorVersion\": 1,\n", + " \"schemaMinorVersion\": 2,\n", + " \"sessionId\": \"8222b610-9472-4bfb-92f5-a56a49cd8199\",\n", + " \"sessionTimestamp\": \"1643116248232\",\n", + " \"dataTimestamp\": \"1643112751681\",\n", + " \"tags\": {\n", + " \"name\": \"test.data\"\n", + " },\n", + " \"metadata\": {}\n", + " },\n", + " \"summaryConstraints\": {\n", + " \"first_name\": {\n", + " \"constraints\": [\n", + " {\n", + " \"name\": \"summary column_values_type EQ STRING\",\n", + " \"firstField\": \"column_values_type\",\n", + " \"value\": 5.0,\n", + " \"op\": \"EQ\",\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary unique_count BTWN 3 and 5\",\n", + " \"firstField\": \"unique_count\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": 3.0,\n", + " \"upperValue\": 5.0\n", + " },\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary most_common_value IN {'Bob', 'Anna', 'John', 'Jane'}\",\n", + " \"firstField\": \"most_common_value\",\n", + " \"op\": \"IN\",\n", + " \"referenceSet\": [\n", + " \"Bob\",\n", + " \"Anna\",\n", + " \"John\",\n", + " \"Jane\"\n", + " ],\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " }\n", + " ]\n", + " },\n", + " \"followers\": {\n", + " \"constraints\": [\n", + " {\n", + " \"name\": \"summary min GE 0/None\",\n", + " \"firstField\": \"min\",\n", + " \"value\": 0.0,\n", + " \"op\": \"GE\",\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary mean BTWN -7308.11238882488 and 40309.612388824884\",\n", + " \"firstField\": \"mean\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": -7308.11238882488,\n", + " \"upperValue\": 40309.612388824884\n", + " },\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary column_values_type EQ INTEGRAL\",\n", + " \"firstField\": \"column_values_type\",\n", + " \"value\": 3.0,\n", + " \"op\": \"EQ\",\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary unique_count BTWN 3 and 5\",\n", + " \"firstField\": \"unique_count\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": 3.0,\n", + " \"upperValue\": 5.0\n", + " },\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary most_common_value IN {'51343', '867', '1525', '12268'}\",\n", + " \"firstField\": \"most_common_value\",\n", + " \"op\": \"IN\",\n", + " \"referenceSet\": [\n", + " \"51343\",\n", + " \"867\",\n", + " \"1525\",\n", + " \"12268\"\n", + " ],\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " }\n", + " ]\n", + " },\n", + " \"last_name\": {\n", + " \"constraints\": [\n", + " {\n", + " \"name\": \"summary column_values_type EQ STRING\",\n", + " \"firstField\": \"column_values_type\",\n", + " \"value\": 5.0,\n", + " \"op\": \"EQ\",\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": 
\"summary unique_count BTWN 2 and 4\",\n", + " \"firstField\": \"unique_count\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": 2.0,\n", + " \"upperValue\": 4.0\n", + " },\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary most_common_value IN {'Jones', 'Doe', 'Smith'}\",\n", + " \"firstField\": \"most_common_value\",\n", + " \"op\": \"IN\",\n", + " \"referenceSet\": [\n", + " \"Jones\",\n", + " \"Doe\",\n", + " \"Smith\"\n", + " ],\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " }\n", + " ]\n", + " },\n", + " \"email\": {\n", + " \"constraints\": [\n", + " {\n", + " \"name\": \"summary column_values_type EQ STRING\",\n", + " \"firstField\": \"column_values_type\",\n", + " \"value\": 5.0,\n", + " \"op\": \"EQ\",\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary unique_count BTWN 3 and 5\",\n", + " \"firstField\": \"unique_count\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": 3.0,\n", + " \"upperValue\": 5.0\n", + " },\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary most_common_value IN {'bob.smith@example.com', 'john.doe@example.com', 'jane.doe@example.com', 'anna_jones@example.com'}\",\n", + " \"firstField\": \"most_common_value\",\n", + " \"op\": \"IN\",\n", + " \"referenceSet\": [\n", + " \"bob.smith@example.com\",\n", + " \"john.doe@example.com\",\n", + " \"jane.doe@example.com\",\n", + " \"anna_jones@example.com\"\n", + " ],\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " }\n", + " ]\n", + " },\n", + " \"points\": {\n", + " \"constraints\": [\n", + " {\n", + " \"name\": \"summary min GE 0/None\",\n", + " \"firstField\": \"min\",\n", + " \"value\": 0.0,\n", + " \"op\": \"GE\",\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary mean BTWN -38.98552432358383 and 344.44552432358387\",\n", + " \"firstField\": \"mean\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": -38.98552432358383,\n", + " \"upperValue\": 344.44552432358387\n", + " },\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary column_values_type EQ FRACTIONAL\",\n", + " \"firstField\": \"column_values_type\",\n", + " \"value\": 2.0,\n", + " \"op\": \"EQ\",\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary most_common_value IN {'123.2', '432.22', '32.1', '23.4'}\",\n", + " \"firstField\": \"most_common_value\",\n", + " \"op\": \"IN\",\n", + " \"referenceSet\": [\n", + " \"123.2\",\n", + " \"432.22\",\n", + " \"32.1\",\n", + " \"23.4\"\n", + " ],\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " }\n", + " ]\n", + " },\n", + " \"username\": {\n", + " \"constraints\": [\n", + " {\n", + " \"name\": \"summary column_values_type EQ STRING\",\n", + " \"firstField\": \"column_values_type\",\n", + " \"value\": 5.0,\n", + " \"op\": \"EQ\",\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary unique_count BTWN 3 and 5\",\n", + " \"firstField\": \"unique_count\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": 3.0,\n", + " \"upperValue\": 5.0\n", + " },\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary most_common_value IN {'jd123', 
'bobsmith', '_anna_', 'jane.doe@example.com'}\",\n", + " \"firstField\": \"most_common_value\",\n", + " \"op\": \"IN\",\n", + " \"referenceSet\": [\n", + " \"jd123\",\n", + " \"bobsmith\",\n", + " \"_anna_\",\n", + " \"jane.doe@example.com\"\n", + " ],\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " }\n", + " ]\n", + " }\n", + " },\n", + " \"valueConstraints\": {}\n", + "}\n" + ] + } + ], + "source": [ + "auto_constraints = profile.generate_constraints()\n", + "print(message_to_json(auto_constraints.to_protobuf()))" + ] + }, + { + "cell_type": "markdown", + "id": "77ea23ed", + "metadata": {}, + "source": [ + "For columns with inferred type STRING, the generate_constraints method generates 3 types of constraints: a columnValuesTypeEqualsConstraint where the expected type is STRING; a columnUniqueValueCountBetweenConstraint, which requires the number of unique values in the column to be between unique_count - 1 and unique_count + 1, computed on the current data frame; and a columnMostCommonValueInSetConstraint, which takes the set of the 5 most common values and requires the most common value of the column to be in that set." + ] + }, + { + "cell_type": "markdown", + "id": "3f683579", + "metadata": {}, + "source": [ + "For columns with inferred type FRACTIONAL or INTEGRAL, such as 'points' and 'followers' respectively, numeric constraints are generated whenever they hold for the current column: minimum value greater than or equal to 0, maximum value less than or equal to 0, and mean in the range [mean - stddev, mean + stddev]. Apart from these, a columnValuesTypeEqualsConstraint and a columnMostCommonValueInSetConstraint are generated for both types, while a columnUniqueValueCountBetweenConstraint is generated only for INTEGRAL valued columns." + ] + }, + { + "cell_type": "markdown", + "id": "5cd56524", + "metadata": {}, + "source": [ + "No constraints are generated for columns which have an inferred type of NULL."
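+ ] + }, + { + "cell_type": "markdown", + "id": "c3d4e5f6", + "metadata": {}, + "source": [ + "The generated constraints come back as an ordinary **DatasetConstraints** object, so they can be attached to a later logging run. A minimal sketch, assuming the profile's **apply_summary_constraints** helper behaves like **apply_table_shape_constraints** above, and reusing the **users** data frame as a stand-in for a fresh batch of the same schema (the names below are hypothetical):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4e5f6a7", + "metadata": {}, + "outputs": [], + "source": [ + "# sketch: log a new batch with the auto-generated constraints attached\n", + "new_profile = session.log_dataframe(users, \"test5.data\", constraints=auto_constraints)\n", + "report = new_profile.apply_summary_constraints()\n", + "format_report(report)"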
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/whylogs/core/columnprofile.py b/src/whylogs/core/columnprofile.py index 243569c8be..df8e1d6dd5 100644 --- a/src/whylogs/core/columnprofile.py +++ b/src/whylogs/core/columnprofile.py @@ -10,19 +10,18 @@ ) from whylogs.core.statistics.constraints import ( MultiColumnValueConstraints, - SummaryConstraint, SummaryConstraints, ValueConstraints, + columnMostCommonValueInSetConstraint, + columnUniqueValueCountBetweenConstraint, + columnValuesTypeEqualsConstraint, + maxLessThanEqualConstraint, + meanBetweenConstraint, + minGreaterThanEqualConstraint, ) from whylogs.core.statistics.hllsketch import HllSketch from whylogs.core.types import TypedDataConverter -from whylogs.proto import ( - ColumnMessage, - ColumnSummary, - InferredType, - Op, - UniqueCountSummary, -) +from whylogs.proto import ColumnMessage, ColumnSummary, InferredType, UniqueCountSummary from whylogs.util.dsketch import FrequentItemsSketch _TYPES = InferredType.Type @@ -181,11 +180,48 @@ def generate_constraints(self) -> SummaryConstraints: items = [] if self.number_tracker is not None and self.number_tracker.count > 0: summ = self.number_tracker.to_summary() - if summ.min > 0: - items = [SummaryConstraint(op=Op.GT, first_field="min", value=0)] - # generate additional constraints here - if len(items) > 0: - return SummaryConstraints(items) + + if summ.min >= 0: + items.append(minGreaterThanEqualConstraint(value=0)) + + mean_lower = summ.mean - summ.stddev + mean_upper = summ.mean + summ.stddev + + if mean_lower != mean_upper: + items.append( + meanBetweenConstraint( + lower_value=mean_lower, + upper_value=mean_upper, + ) + ) + + if summ.max <= 0: + items.append(maxLessThanEqualConstraint(value=0)) + + schema_summary = self.schema_tracker.to_summary() + inferred_type = schema_summary.inferred_type.type + if inferred_type not in (InferredType.UNKNOWN, InferredType.NULL): + items.append(columnValuesTypeEqualsConstraint(expected_type=inferred_type)) + + if self.cardinality_tracker and inferred_type != InferredType.FRACTIONAL: + unique_count = self.cardinality_tracker.to_summary() + if unique_count and unique_count.estimate > 0: + low = int(max(0, unique_count.lower - 1)) + up = int(unique_count.upper + 1) + items.append( + columnUniqueValueCountBetweenConstraint( + lower_value=low, + upper_value=up, + ) + ) + + frequent_items_summary = self.frequent_items.to_summary(max_items=5) + if frequent_items_summary and len(frequent_items_summary.items) > 0: + most_common_value_set = {val.json_value for val in frequent_items_summary.items} + items.append(columnMostCommonValueInSetConstraint(value_set=most_common_value_set)) + + if len(items) > 0: + return SummaryConstraints(items) return None diff --git a/src/whylogs/core/datasetprofile.py b/src/whylogs/core/datasetprofile.py index e2aa0a62e1..a9827f064d 100644 --- a/src/whylogs/core/datasetprofile.py +++ b/src/whylogs/core/datasetprofile.py @@ -167,7 +167,8 @@ def session_timestamp_ms(self): @property def total_row_number(self): - return max(self.column_row_dict.values()) + dict_counts = self.column_row_dict.values() if len(self.column_row_dict) else [0] + 
return max(dict_counts) def add_output_field(self, field: Union[str, List[str]]): if self.model_profile is None: @@ -311,17 +312,18 @@ def track_dataframe(self, df: pd.DataFrame, character_list=None, token_method=No large_df = element_count > 200000 if large_df: logger.warning(f"About to log a dataframe with {element_count} elements, logging might take some time to complete.") + count = 0 - columns_len = len(df.columns) num_records = len(df) for idx in range(num_records): - row_values = df.iloc[idx].values + row_values = [] count += 1 - for col_idx in range(columns_len): - col = df.columns[col_idx] - col_str = str(col) - self.track(col_str, row_values[col_idx], character_list=None, token_method=None) + for col in df.columns: + col_values = df[col].values + value = col_values[idx] + row_values.append(value) + self.track(col, value, character_list=None, token_method=None) if large_df and (count % 200000 == 0): logger.warning(f"Logged {count} elements out of {element_count}") diff --git a/src/whylogs/core/summaryconverters.py b/src/whylogs/core/summaryconverters.py index 483f4ea339..bf48dfa6eb 100644 --- a/src/whylogs/core/summaryconverters.py +++ b/src/whylogs/core/summaryconverters.py @@ -6,13 +6,13 @@ import datasketches import numpy as np -import scipy.special from datasketches import ( frequent_items_error_type, frequent_strings_sketch, kll_floats_sketch, update_theta_sketch, ) +from scipy import special, stats from whylogs.proto import ( ColumnSummary, @@ -234,7 +234,7 @@ def ks_test_compute_p_value(target_distribution: kll_floats_sketch, reference_di if D > D_max: D_max = D n_samples = min(target_distribution.get_n(), reference_distribution.get_n()) - p_value = scipy.special.kolmogorov(np.sqrt(n_samples) * D_max) + p_value = special.kolmogorov(np.sqrt(n_samples) * D_max) return type("Object", (), {"ks_test": p_value}) @@ -323,5 +323,5 @@ def compute_chi_squared_test_p_value(target_distribution: ReferenceDistributionD chi_sq += (i_frequency - ref_frequency) ** 2 / ref_frequency degrees_of_freedom = target_unique_count - 1 - p_value = scipy.stats.chi2.sf(chi_sq, degrees_of_freedom) + p_value = stats.chi2.sf(chi_sq, degrees_of_freedom) return type("Object", (), {"chi_squared_test": p_value}) diff --git a/tests/unit/core/statistics/test_constraints.py b/tests/unit/core/statistics/test_constraints.py index 07005e373d..59d9e5c0a3 100644 --- a/tests/unit/core/statistics/test_constraints.py +++ b/tests/unit/core/statistics/test_constraints.py @@ -66,7 +66,6 @@ def test_value_summary_serialization(): - for each_op, _ in _value_funcs.items(): if each_op == Op.APPLY_FUNC: continue @@ -101,7 +100,6 @@ def test_value_summary_serialization(): def test_value_constraints(df_lending_club, local_config_path): - conforming_loan = ValueConstraint(Op.LT, 548250) smallest_loan = ValueConstraint(Op.GT, 2500.0, verbose=True) @@ -126,7 +124,6 @@ def test_value_constraints(df_lending_club, local_config_path): def test_value_constraints_pattern_match(df_lending_club, local_config_path): - regex_state_abbreviation = r"^[a-zA-Z]{2}$" contains_state = ValueConstraint(Op.MATCH, regex_pattern=regex_state_abbreviation) @@ -300,7 +297,6 @@ def test_value_constraints_raw_and_coerced_types_report(): def test_summary_between_serialization_deserialization(): - # constraints may have an optional name sum_constraint = SummaryConstraint("min", Op.BTWN, 0.1, 2.4) msg_sum_const = sum_constraint.to_protobuf() @@ -375,7 +371,6 @@ def test_summary_between_constraints_fields(df_lending_club, local_config_path): def 
test_summary_between_constraints_no_merge_different_values_fields(): - std_dev_between1 = SummaryConstraint("stddev", Op.BTWN, value=0.1, upper_value=200) std_dev_between2 = SummaryConstraint("stddev", Op.BTWN, value=0.2, upper_value=200) @@ -531,7 +526,6 @@ def test_max_between_constraint_invalid(): def _apply_summary_constraints_on_dataset(df_lending_club, local_config_path, summary_constraints): - dc = DatasetConstraints(None, summary_constraints=summary_constraints) config = load_config(local_config_path) session = session_from_config(config) @@ -586,7 +580,6 @@ def test_set_summary_constraint_invalid_init(): def test_set_summary_no_merge_different_set(): - set_c_1 = SummaryConstraint("distinct_column_values", Op.CONTAIN_SET, reference_set=[1, 2, 3]) set_c_2 = SummaryConstraint("distinct_column_values", Op.CONTAIN_SET, reference_set=[2, 3, 4, 5]) with pytest.raises(AssertionError): @@ -728,7 +721,6 @@ def _apply_string_length_constraints(local_config_path, length_constraints): def test_string_length_constraints(local_config_path): - length_constraint7 = stringLengthEqualConstraint(length=7) length_constraint24 = stringLengthEqualConstraint(length=24) length_constraint7to10 = stringLengthBetweenConstraint(lower_value=7, upper_value=10) @@ -894,7 +886,6 @@ def _apply_apply_func_constraints(local_config_path, apply_func_constraints): def test_apply_func_value_constraints(local_config_path): - dateutil_parseable = dateUtilParseableConstraint() json_parseable = jsonParseableConstraint() @@ -2259,6 +2250,153 @@ def test_chi_squared_test_p_value_greater_than_constraint_wrong_datatype(): columnChiSquaredTestPValueGreaterThanConstraint(["a", "b", "c"], p_value=1.2, verbose=True) +def test_generate_default_constraints_categorical(local_config_path): + usernames = ["jd123", "jane.doe@example.com", "bobsmith", "_anna_"] + emails = ["john.doe@example.com", "jane.doe@example.com", "bob.smith@example.com", "anna_jones@example.com"] + data = pd.DataFrame( + { + "username": usernames, + "email": emails, + } + ) + config = load_config(local_config_path) + session = session_from_config(config) + profile = session.log_dataframe(data, "test.data") + generated_constraints = profile.generate_constraints() + + json_summ = json.loads(message_to_json(generated_constraints.to_protobuf())) + constraints_username = json_summ["summaryConstraints"]["username"]["constraints"] + constraints_email = json_summ["summaryConstraints"]["email"]["constraints"] + + # username constraints + assert len(constraints_username) == 3 # column value type equals, unique count between and most common value in set + assert constraints_username[0]["name"] == "summary column_values_type EQ STRING" + assert constraints_username[0]["firstField"] == "column_values_type" + assert constraints_username[0]["value"] == InferredType.STRING + assert constraints_username[0]["op"] == Op.Name(Op.EQ) + assert constraints_username[0]["verbose"] is False + + # there are 4 unique values in the df for username, so the unique count between is in the range 4-1 and 4+1 + assert constraints_username[1]["name"] == "summary unique_count BTWN 3 and 5" + assert constraints_username[1]["firstField"] == "unique_count" + assert constraints_username[1]["op"] == Op.Name(Op.BTWN) + assert pytest.approx(constraints_username[1]["between"]["lowerValue"], 0.001) == 3 + assert pytest.approx(constraints_username[1]["between"]["upperValue"], 0.001) == 5 + assert constraints_username[1]["verbose"] is False + + assert f"summary most_common_value IN" in 
constraints_username[2]["name"] # set has different order + assert constraints_username[2]["firstField"] == "most_common_value" + assert constraints_username[2]["op"] == Op.Name(Op.IN) + assert set(constraints_username[2]["referenceSet"]) == set(usernames) + assert constraints_username[2]["verbose"] is False + + # email constraints + assert len(constraints_email) == 3 # column value type equals, unique count between and most common value in set + assert constraints_email[0]["name"] == "summary column_values_type EQ STRING" + assert constraints_email[0]["firstField"] == "column_values_type" + assert constraints_email[0]["value"] == InferredType.STRING + assert constraints_email[0]["op"] == Op.Name(Op.EQ) + assert constraints_email[0]["verbose"] is False + + # there are 4 unique values in the df for email, so the unique count between is in the range 4-1 and 4+1 + assert constraints_email[1]["name"] == "summary unique_count BTWN 3 and 5" + assert constraints_email[1]["firstField"] == "unique_count" + assert constraints_email[1]["op"] == Op.Name(Op.BTWN) + assert pytest.approx(constraints_email[1]["between"]["lowerValue"], 0.001) == 3 + assert pytest.approx(constraints_email[1]["between"]["upperValue"], 0.001) == 5 + assert constraints_email[1]["verbose"] is False + + assert f"summary most_common_value IN" in constraints_email[2]["name"] # set has different order + assert constraints_email[2]["firstField"] == "most_common_value" + assert constraints_email[2]["op"] == Op.Name(Op.IN) + assert set(constraints_email[2]["referenceSet"]) == set(emails) + assert constraints_email[2]["verbose"] is False + + + def test_generate_default_constraints_numeric(local_config_path): + data = pd.DataFrame( + { + "followers": [1525, 12268, 51343, 867, 567, 100265, 22113, 3412], + "points": [23.4, 123.2, 432.22, 32.1, 44.1, 42.2, 344.2, 42.1], + } + ) + + config = load_config(local_config_path) + session = session_from_config(config) + profile = session.log_dataframe(data, "test.data") + generated_constraints = profile.generate_constraints() + + json_summ = json.loads(message_to_json(generated_constraints.to_protobuf())) + followers_constraints = json_summ["summaryConstraints"]["followers"]["constraints"] + points_constraints = json_summ["summaryConstraints"]["points"]["constraints"] + + assert len(followers_constraints) == 5 + # min greater than or equal to 0, mean between mean-stddev and mean+stddev, + # column values type, most common value in set, unique count between + + followers_mean = data["followers"].mean() + followers_stddev = data["followers"].std() + lower_followers = followers_mean - followers_stddev + upper_followers = followers_mean + followers_stddev + + assert followers_constraints[0]["name"] == "summary min GE 0/None" + assert followers_constraints[1]["name"] == f"summary mean BTWN {lower_followers} and {upper_followers}" + assert followers_constraints[2]["name"] == "summary column_values_type EQ INTEGRAL" + assert followers_constraints[3]["name"] == "summary unique_count BTWN 7 and 9" # we have 8 unique values in the df + assert "summary most_common_value IN" in followers_constraints[4]["name"] + + assert len(points_constraints) == 4 + # min greater than or equal to 0, mean between mean-stddev and mean+stddev, + # column values type, most common value in set + points_mean = data["points"].mean() + points_stddev = data["points"].std() + lower_points = points_mean - points_stddev + upper_points = points_mean + points_stddev + + assert points_constraints[0]["name"] == "summary min GE 0/None" + assert 
points_constraints[1]["name"] == f"summary mean BTWN {lower_points} and {upper_points}" + assert points_constraints[2]["name"] == "summary column_values_type EQ FRACTIONAL" + assert "summary most_common_value IN" in points_constraints[3]["name"] + + + def test_generate_default_constraints_mixed(local_config_path): + data = pd.DataFrame( + {"username": ["jd123", "jane.doe@example.com", "bobsmith", "_anna_"], "followers": [1525, 12268, 51343, 867], "null": [None, None, None, None]} + ) + + config = load_config(local_config_path) + session = session_from_config(config) + profile = session.log_dataframe(data, "test.data") + generated_constraints = profile.generate_constraints() + + json_summ = json.loads(message_to_json(generated_constraints.to_protobuf())) + username_constraints = json_summ["summaryConstraints"]["username"]["constraints"] + followers_constraints = json_summ["summaryConstraints"]["followers"]["constraints"] + + # no constraints should be generated for the null column since all values are None + assert "null" not in json_summ["summaryConstraints"] + + assert len(username_constraints) == 3 # column value type equals, unique count between and most common value in set + assert username_constraints[0]["name"] == "summary column_values_type EQ STRING" + assert username_constraints[1]["name"] == "summary unique_count BTWN 3 and 5" # we have 4 unique values in df + assert f"summary most_common_value IN" in username_constraints[2]["name"] + + assert len(followers_constraints) == 5 + # min greater than or equal to 0, mean between mean-stddev and mean+stddev, + # column values type, most common value in set, unique count between + + followers_mean = data["followers"].mean() + followers_stddev = data["followers"].std() + lower_followers = followers_mean - followers_stddev + upper_followers = followers_mean + followers_stddev + + assert followers_constraints[0]["name"] == "summary min GE 0/None" + assert followers_constraints[1]["name"] == f"summary mean BTWN {lower_followers} and {upper_followers}" + assert followers_constraints[2]["name"] == "summary column_values_type EQ INTEGRAL" + assert followers_constraints[3]["name"] == "summary unique_count BTWN 3 and 5" # we have 4 unique values in the df + assert "summary most_common_value IN" in followers_constraints[4]["name"] + + + def _apply_value_constraints_on_dataset(df_lending_club, local_config_path, value_constraints=None, multi_column_value_constraints=None): + dc = DatasetConstraints(None, value_constraints=value_constraints, multi_column_value_constraints=multi_column_value_constraints) + config = load_config(local_config_path)