From af9a29878599df78aade9e7ccdd213efab32dbd0 Mon Sep 17 00:00:00 2001 From: pecop2 Date: Mon, 17 Jan 2022 14:31:36 +0100 Subject: [PATCH 01/10] Example notebook for the newly created constraints --- examples/Constraints2.ipynb | 1900 +++++++++++++++++++++++++++++++++++ 1 file changed, 1900 insertions(+) create mode 100644 examples/Constraints2.ipynb diff --git a/examples/Constraints2.ipynb b/examples/Constraints2.ipynb new file mode 100644 index 0000000000..b901828e8e --- /dev/null +++ b/examples/Constraints2.ipynb @@ -0,0 +1,1900 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### whylogs provides a specific helper function for each common constraint. Fall back to the generic ValueConstraint and SummaryConstraint only when you need a custom constraint that no helper covers." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARN: Missing config\n" + ] + } + ], + "source": [ + "from whylogs import get_or_create_session\n", + "from whylogs.util.protobuf import message_to_json\n", + "\n", + "# create session\n", + "session = get_or_create_session()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "from tabulate import tabulate\n", + "\n", + "def indent(txt, spaces=4):\n", + " return \"\\n\".join(\" \" * spaces + ln for ln in txt.splitlines())\n", + "\n", + "def format_report(r):\n", + " # report failures in tabular form\n", + " \n", + " r_2 = [entry for entry in r if len(entry)==2] # all the single column constraints\n", + " r_table_shape = [[entry for entry in r if len(entry)!=2 and entry[0].startswith(\"table\")]] # table shape constraints\n", + " r_multi_column = [[entry for entry in r if len(entry)!=2 and entry[0].startswith(\"multi column\")]] # multi column constraints\n", + " \n", + " if len(r_2):\n", + " print(\"Constraint failures by feature - \")\n", + " for c,r in r_2:\n", + " print(f\"{c}:\")\n", + " if len(r[0][0]) > 80: \n", + " print(f\"\\ntest_name:\\t{r[0][0]}\\n\")\n", + " print(f\"total_run:\\t{r[0][1]}\\n\")\n", + " print(f\"failed:\\t\\t{r[0][2]}\\n\")\n", + " else: \n", + " print(indent(tabulate(r, tablefmt=\"plain\", headers=['test_name', 'total_run', 'failed'])))\n", + " \n", + " if len(r_table_shape[0]):\n", + " print() \n", + " print(\"Table shape constraint failures -\")\n", + " for entry in r_table_shape:\n", + " print(indent(tabulate(entry, tablefmt=\"plain\", headers=['test_name', 'total_run', 'failed'])))\n", + " \n", + " if len(r_multi_column[0]):\n", + " print() \n", + " print(\"Multi column constraint failures -\")\n", + " for entry in r_multi_column:\n", + " if len(entry[0][0]) > 80: \n", + " print(f\"\\ntest_name:\\t{entry[0][0]}\\n\")\n", + " print(f\"total_run:\\t{entry[0][1]}\\n\")\n", + " print(f\"failed:\\t\\t{entry[0][2]}\\n\")\n", + " else:\n", + " print(indent(tabulate(entry, tablefmt=\"plain\", headers=['test_name', 'total_run', 'failed'])))\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Between summary constraints on summary fields such as stddev, min, max, mean..."
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import (\n", + " maxBetweenConstraint,\n", + " maxLessThanEqualConstraint,\n", + " meanBetweenConstraint,\n", + " minBetweenConstraint,\n", + " minGreaterThanEqualConstraint,\n", + " stddevBetweenConstraint,\n", + " stringLengthBetweenConstraint,\n", + " stringLengthEqualConstraint,\n", + " quantileBetweenConstraint,\n", + " DatasetConstraints,\n", + " SummaryConstraints,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraints for column 'col1': \n", + "[\n", + " {\n", + " \"name\": \"summary max BTWN 5 and 10.8\",\n", + " \"firstField\": \"max\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": 5.0,\n", + " \"upperValue\": 10.8\n", + " },\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary stddev BTWN 2.3 and 5.4\",\n", + " \"firstField\": \"stddev\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": 2.3,\n", + " \"upperValue\": 5.4\n", + " },\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary min GE 1/None\",\n", + " \"firstField\": \"min\",\n", + " \"value\": 1.0,\n", + " \"op\": \"GE\",\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " }\n", + "]\n", + "\n", + "Constraints for column 'col2': \n", + "[\n", + " {\n", + " \"name\": \"summary mean BTWN 1.2 and 1.6\",\n", + " \"firstField\": \"mean\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": 1.2,\n", + " \"upperValue\": 1.6\n", + " },\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary min BTWN 0.1 and 0.5\",\n", + " \"firstField\": \"min\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": 0.1,\n", + " \"upperValue\": 0.5\n", + " },\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary quantile 0.15 BTWN 2 and 4.3\",\n", + " \"firstField\": \"quantile\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": 2.0,\n", + " \"upperValue\": 4.3\n", + " },\n", + " \"quantileValue\": 0.15,\n", + " \"verbose\": false\n", + " }\n", + "]\n", + "\n", + "Constraints for column 'col3': \n", + "[\n", + " {\n", + " \"name\": \"summary max LE 100/None\",\n", + " \"firstField\": \"max\",\n", + " \"value\": 100.0,\n", + " \"op\": \"LE\",\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " }\n", + "]\n", + "\n" + ] + } + ], + "source": [ + "# define the specific types of constraints\n", + "# the ranges of the between constraints include the bounding values\n", + "\n", + "# check if the maximum value of the column is in the range [5, 10.8]\n", + "max_between_values = maxBetweenConstraint(lower_value=5, upper_value=10.8) \n", + "# check if the maximum value of the column is less than or equal to 100\n", + "max_less_than_equal_value = maxLessThanEqualConstraint(value=100)\n", + "# check if the mean of the column is in the range [1.2, 1.6] \n", + "mean_between_values = meanBetweenConstraint(lower_value=1.2, upper_value=1.6)\n", + "# check if the minimum value of the column is in the range [0.1, 0.5]\n", + "min_between_values = minBetweenConstraint(lower_value=0.1, upper_value=0.5)\n", + "# check 
if the minimum value of the column is greater than or equal to 1\n", + "min_greater_than_equal_value = minGreaterThanEqualConstraint(value=1)\n", + "# check if the standard deviation of the column is in the range [2.3, 5.4]\n", + "stddev_between_values = stddevBetweenConstraint(lower_value=2.3, upper_value=5.4)\n", + "# check if the 0.15 quantile value is in the range [2, 4.3]\n", + "quantile_between_values = quantileBetweenConstraint(quantile_value = 0.15, lower_value=2, upper_value=4.3) \n", + "\n", + "# example data frame with columns \"col1\",\"col2\", \"col3\"\n", + "# you can also read an existing data set using pandas, or as a numpy array\n", + "df = pd.DataFrame({\n", + " \"col1\": [4, 5, 6, 7],\n", + " \"col2\": [0, 1, 2, 3],\n", + " \"col3\": [50, 60, 80, 110]\n", + "})\n", + "\n", + "# bind the standard deviation between constraint to the dataframe column named \"col1\"\n", + "# bind the mean between constraint to the dataframe column named \"col2\"\n", + "# you can add multiple summary constraints for each column\n", + "dc = DatasetConstraints(None, summary_constraints={\n", + " \"col1\": [max_between_values, stddev_between_values, min_greater_than_equal_value], \n", + " \"col2\": [mean_between_values, min_between_values, quantile_between_values],\n", + " \"col3\": [max_less_than_equal_value]\n", + "}) \n", + "\n", + "# logging the dataframe creates a profile with summary statistics for the data set\n", + "# the data set profile contains column profiles with summary statistics for each column present in the data set\n", + "profile = session.log_dataframe(df, \"test.data\", constraints=dc)\n", + "\n", + "# serialize the DatasetConstraints to JSON\n", + "dc_json = json.loads(dc.to_json())\n", + "col1_constraints = json.dumps(dc_json['summaryConstraints']['col1']['constraints'], indent=4)\n", + "col2_constraints = json.dumps(dc_json['summaryConstraints']['col2']['constraints'], indent=4)\n", + "col3_constraints = json.dumps(dc_json['summaryConstraints']['col3']['constraints'], indent=4)\n", + "\n", + "print(f\"Constraints for column 'col1': \\n{col1_constraints}\\n\")\n", + "print(f\"Constraints for column 'col2': \\n{col2_constraints}\\n\")\n", + "print(f\"Constraints for column 'col3': \\n{col3_constraints}\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Summary constraints are applied with apply_summary_constraints on the DatasetProfile." + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "col1:\n", + " test_name total_run failed\n", + " summary max BTWN 5 and 10.8 1 0\n", + " summary stddev BTWN 2.3 and 5.4 1 1\n", + " summary min GE 1/None 1 0\n", + "col2:\n", + " test_name total_run failed\n", + " summary mean BTWN 1.2 and 1.6 1 0\n", + " summary min BTWN 0.1 and 0.5 1 1\n", + " summary quantile 0.15 BTWN 2 and 4.3 1 1\n", + "col3:\n", + " test_name total_run failed\n", + " summary max LE 100/None 1 1\n" + ] + } + ], + "source": [ + "# summary constraints must be applied on the dataset profile, after logging the dataframe\n", + "report = profile.apply_summary_constraints()\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see, the **mean BTWN** constraint passes and the **stddev BTWN** constraint fails, as expected."
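Where no helper exists, the generic classes can be combined by hand. Below is a minimal sketch, assuming (as the imports and the report fields above suggest) that whylogs v0 exposes the comparison operators as `Op` in `whylogs.proto` and that `SummaryConstraint` takes a summary field name, an operator, and a comparison value; verify the exact signatures against your installed version.

```python
# Hypothetical custom constraints built from the generic classes (sketch only)
from whylogs.core.statistics.constraints import (
    DatasetConstraints,
    SummaryConstraint,
    ValueConstraint,
)
from whylogs.proto import Op  # assumed location of the Op enum

# summary constraint: the mean of the column must be greater than 2.0
custom_mean_gt = SummaryConstraint("mean", Op.GT, value=2.0)
# value constraint: every individual logged value must be less than 100
custom_value_lt = ValueConstraint(Op.LT, 100)

dc = DatasetConstraints(
    None,
    value_constraints={"col1": [custom_value_lt]},
    summary_constraints={"col1": [custom_mean_gt]},
)
```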
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary constraints for distinct, unique and most common values in a column" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Distinct values in a column" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import (\n", + " distinctValuesInSetConstraint, distinctValuesEqualSetConstraint, distinctValuesContainSetConstraint )" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": {}, + "outputs": [], + "source": [ + "in_set = distinctValuesInSetConstraint(reference_set=set(range(1, 10)))\n", + "eq_set = distinctValuesEqualSetConstraint(reference_set={'a', 'a', 'a'})\n", + "contain_set = distinctValuesContainSetConstraint(reference_set={0, 1})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Summary constraints can also be passed as an argument to the apply_summary_constraints function, on the same profile as before." + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "col1:\n", + " test_name total_run failed\n", + " summary distinct_column_values IN_SET {1, 2, 3, 4, 5, 6, 7, 8, 9} 1 0\n", + " summary distinct_column_values EQ_SET {'a'} 1 1\n", + "col2:\n", + " test_name total_run failed\n", + " summary distinct_column_values CONTAIN_SET {0, 1} 1 0\n" + ] + } + ], + "source": [ + "report = profile.apply_summary_constraints({'col1': SummaryConstraints([in_set, eq_set]), \n", + " 'col2': SummaryConstraints([contain_set])})\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Unique column value count and proportion constraints" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import (\n", + " columnUniqueValueCountBetweenConstraint,\n", + " columnUniqueValueProportionBetweenConstraint,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# create a data set with customers, the country they live in, and their spending\n", + "customer_data = pd.DataFrame({\n", + " \"customer\": [\"c1\", \"c2\", \"c3\", \"c4\", \"c5\", \"c6\"],\n", + " \"country\": [\"Germany\", \"Italy\", \"Germany\", \"USA\", \"Germany\", \"UK\"],\n", + " \"spending\": [1200, 500, 700, 1500, 300, None]\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "country:\n", + " test_name total_run failed\n", + " summary unique_count BTWN 1 and 5 1 0\n", + " summary unique_proportion BTWN 0.3 and 0.45 1 1\n" + ] + } + ], + "source": [ + "# check if there are between 1 and 5 unique values in the specific column\n", + "unique_value_count_between = columnUniqueValueCountBetweenConstraint(lower_value=1, upper_value=5)\n", + "# check if the proportion of unique values in the column is between 0.3 and 0.45, inclusive\n", + "unique_value_proportion_between = columnUniqueValueProportionBetweenConstraint(lower_fraction=0.3, upper_fraction=0.45)\n", + "dc = DatasetConstraints(None, summary_constraints={\"country\": [unique_value_count_between, 
unique_value_proportion_between]})\n", + "\n", + "# log the customer_data dataframe to obtain the profile\n", + "profile = session.log_dataframe(customer_data, 'test2.data', constraints=dc)\n", + "# summary constraints must be applied on the profile after the data set has been logged\n", + "report = profile.apply_summary_constraints()\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Column most common value in set constraint" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import columnMostCommonValueInSetConstraint" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "country:\n", + " test_name total_run failed\n", + " summary most_common_value IN {'Germany', 'Italy'} 1 0\n" + ] + } + ], + "source": [ + "# check if the most common value in the column is in the set {\"Germany\", \"Italy\"}\n", + "most_common_value_in_set = columnMostCommonValueInSetConstraint(value_set={\"Germany\", \"Italy\"})\n", + "# bind the constraint to the column named \"country\"\n", + "summary_constraint = {\"country\": [most_common_value_in_set]}\n", + "# apply the summary constraints on the same profile for the customer_data data set\n", + "report = profile.apply_summary_constraints(summary_constraint)\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Column values not null" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import columnValuesNotNullConstraint" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "customer:\n", + " test_name total_run failed\n", + " summary null_count EQ 0/None 1 0\n", + "spending:\n", + " test_name total_run failed\n", + " summary null_count EQ 0/None 1 1\n" + ] + } + ], + "source": [ + "# check if all values in the column are non-null\n", + "customer_value_not_null = columnValuesNotNullConstraint()\n", + "spending_value_not_null = columnValuesNotNullConstraint()\n", + "# bind the constraints to the columns; there are no null values in the customer column, but there is one in the spending column\n", + "summary_constraint = {\"customer\": [customer_value_not_null], \"spending\": [spending_value_not_null]}\n", + "# apply the summary constraints on the same profile for the customer_data data set\n", + "report = profile.apply_summary_constraints(summary_constraint)\n", + "\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Column value type equals or is in set constraint" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import (\n", + " columnValuesTypeEqualsConstraint,\n", + " columnValuesTypeInSetConstraint\n", + ")\n", + "from whylogs.proto import InferredType" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "country:\n", + " test_name total_run failed\n", + " 
summary column_values_type EQ STRING 1 0\n", + "spending:\n", + " test_name total_run failed\n", + " summary column_values_type IN {'FRACTIONAL', 'INTEGRAL'} 1 0\n" + ] + } + ], + "source": [ + "# check if the values of the specified column are of type string\n", + "column_values_type_equals_string = columnValuesTypeEqualsConstraint(expected_type=InferredType.Type.STRING)\n", + "# check if the values of the specified column are either fractional or integral numbers\n", + "type_set = {InferredType.Type.FRACTIONAL, InferredType.Type.INTEGRAL}\n", + "column_value_types_in_set = columnValuesTypeInSetConstraint(type_set=type_set, verbose=True)\n", + "\n", + "column_type_summary_constraint = {\n", + " \"country\": [column_values_type_equals_string],\n", + " \"spending\": [column_value_types_in_set]\n", + "}\n", + "\n", + "# apply the summary constraints on the same profile for the customer_data data set\n", + "report = profile.apply_summary_constraints(column_type_summary_constraint)\n", + "# should not have failures since the country column type is string, and the spending column contains numbers\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Column values in set" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import columnValuesInSetConstraint" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "grade:\n", + " test_name total_run failed\n", + " value IN {'A', 'B', 'E', 'C', 'F'} 5 1\n" + ] + } + ], + "source": [ + "student_grades = pd.DataFrame({\n", + " 'student_id': [1, 5, 15, 16, 22],\n", + " 'grade': ['C', 'C', 'A', '/', 'B']\n", + "})\n", + "\n", + "val_set = {'A', 'B', 'C', 'E', 'F'} # valid grades\n", + "column_values_in_set = columnValuesInSetConstraint(value_set=val_set)\n", + "\n", + "dc = DatasetConstraints(None, value_constraints={\n", + " \"grade\": [column_values_in_set], \n", + "})\n", + "\n", + "# the value constraints are applied at the time of logging the dataframe\n", + "profile = session.log_dataframe(student_grades, \"test.data\", constraints=dc)\n", + "\n", + "# out of the five students' grades we expect to see one failure, for the unknown grade '/'\n", + "# the total number of runs of the constraint should equal the number of values in the column\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Regex matching constraints" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### String length value constraints using regex" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "str1:\n", + " test_name total_run failed\n", + " value MATCH ^.{7}$ 7 5\n", + " value MATCH ^.{7,10}$ 7 2\n" + ] + } + ], + "source": [ + "from whylogs.core.statistics.constraints import stringLengthEqualConstraint, stringLengthBetweenConstraint\n", + "df = pd.DataFrame(\n", + " [\n", + " {\"str1\": \"length7\"},\n", + " {\"str1\": \"length_8\"},\n", + " {\"str1\": \"length__9\"},\n", + " {\"str1\": \"a 10\"},\n", + " {\"str1\": \"11 b\"},\n", + " {\"str1\": '(*&^%^&*(24!@_+>:|}?><\"\\\\'},\n", + " {\"str1\": \"1b34567\"},\n", + " ]\n", + ")\n", + 
"length_constraint7 = stringLengthEqualConstraint(length=7)\n", + "length_constraint7to10 = stringLengthBetweenConstraint(lower_value=7, upper_value=10)\n", + "length_constraints = [length_constraint7, length_constraint7to10]\n", + "dc = DatasetConstraints(None, value_constraints={\"str1\": length_constraints})\n", + "\n", + "profile = session.log_dataframe(df, 'test2.data', constraints=dc)\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Email matching constraint" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import containsEmailConstraint" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "email:\n", + "\n", + "test_name:\tvalue MATCH ^(?i)(?:[a-z0-9!#$%&\\'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&\\'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)$\n", + "\n", + "total_run:\t8\n", + "\n", + "failed:\t\t4\n", + "\n" + ] + } + ], + "source": [ + "customer_emails = pd.DataFrame([\n", + " {\"email\": r\"abc's@gmail.com\"}, # valid\n", + " {\"email\": r'\"aVrrR Test \\@\"@gmail.com'}, # valid (if wrapped in quotes, emails can contain special characters)\n", + " {\"email\": r\"abc..q12@example.us\"}, # invalid (two consecutive dots)\n", + " {\"email\": r'\"sdsss\\d\"@gmail.com'}, # valid\n", + " {\"email\": r\"customer/department=shipping?@example-another.some-other.us\"}, # valid\n", + " {\"email\": r\".should_fail@yahoo.com\"}, # invalid (must not start wiht dot)\n", + " {\"email\": r\"some.@a.com\"}, # invalid (must not contain a dot directly before the @ symbol)\n", + " {\"email\": r\"abs@yahoo.\"}, # invalid (must not end with a dot)\n", + "])\n", + "\n", + "# use the predefined email regex from whylogs\n", + "default_contains_email_constraint = containsEmailConstraint()\n", + "\n", + "dc = DatasetConstraints(None, value_constraints={\"email\": [default_contains_email_constraint]})\n", + "\n", + "profile = session.log_dataframe(customer_emails, 'test.data', constraints=dc)\n", + "# we expect 4 of the 8 runs to be failures\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: supplying your own regex pattern might cause slower evaluation of the containsEmailConstraint, depending on its complexity.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "email:\n", + " test_name total_run failed\n", + " value MATCH \\S+@\\S+ 8 1\n" + ] + } + ], + "source": [ + "# you can provide your own email regex and check the values against it\n", + "custom_contains_email_constraint = containsEmailConstraint(regex_pattern = r\"\\S+@\\S+\")\n", + "dc = DatasetConstraints(None, value_constraints={\"email\": [custom_contains_email_constraint]})\n", + "\n", + "profile = session.log_dataframe(customer_emails, 'test.data', constraints=dc)\n", + "# now we expect 1 of the 8 runs to be failures, the email that contains white spaces\n", + "format_report(dc.report())\n", + "# running the containsEmailConstraint with your own regex pattern may 
cause slow evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Credit Card matching constraint" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import containsCreditCardConstraint" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "credit_card:\n", + "\n", + "test_name:\tvalue MATCH ^(?:(4[0-9]{3}([\\s-]?[0-9]{4}){2}[\\s-]?[0-9]{1,4})|(?:(5[1-5][0-9]{2}([\\s-]?[0-9]{4}){3}))|(?:(6(?:011|5[0-9]{2})([\\s-]?[0-9]{4}){3}))|(?:(3[47][0-9]{2}[\\s-]?[0-9]{6}[\\s-]?[0-9]{5}))|(?:(3(?:0[0-5]|[68][0-9])[0-9][\\s-]?[0-9]{6}[\\s-]?[0-9]{4}))|(?:2131|1800|35[0-9]{2,3}([\\s-]?[0-9]{4}){3}))$\n", + "\n", + "total_run:\t19\n", + "\n", + "failed:\t\t5\n", + "\n" + ] + } + ], + "source": [ + "credit_cards = pd.DataFrame(\n", + " [\n", + " {\"credit_card\": \"3714-496353-98431\"}, # amex\n", + " {\"credit_card\": \"3787 344936 71000\"}, # amex\n", + " {\"credit_card\": \"3056 930902 5904\"}, # diners club\n", + " {\"credit_card\": \"3065 133242 2899\"}, # invalid\n", + " {\"credit_card\": \"3852-000002-3237\"}, # diners club\n", + " {\"credit_card\": \"6011 1111 1111 1117\"}, # discover\n", + " {\"credit_card\": \"6011-0009-9013-9424\"}, # discover\n", + " {\"credit_card\": \"3530 1113 3330 0000\"}, # jcb\n", + " {\"credit_card\": \"3566-0020-2036-0505\"}, # jcb\n", + " {\"credit_card\": \"5555 5555 5555 4444\"}, # master card\n", + " {\"credit_card\": \"5105 1051 0510 5100\"}, # master card\n", + " {\"credit_card\": \"4111 1111 1111 1111\"}, # visa\n", + " {\"credit_card\": \"4012 8888 8888 1881\"}, # visa\n", + " {\"credit_card\": \"4222-2222-2222-2222\"}, # visa\n", + " {\"credit_card\": \"1111-1111-1111-1111\"}, # invalid\n", + " {\"credit_card\": \"a4111 1111 1111 1111b\"}, # invalid\n", + " {\"credit_card\": \"4111111111111111\"}, # visa\n", + " {\"credit_card\": 12345}, # invalid\n", + " {\"credit_card\": \"absfcvs\"}, # invalid\n", + " ]\n", + ")\n", + "\n", + "default_credit_card_constraint = containsCreditCardConstraint()\n", + "dc = DatasetConstraints(None, value_constraints={\"credit_card\": [default_credit_card_constraint]})\n", + "\n", + "profile = session.log_dataframe(credit_cards, 'test.data', constraints=dc)\n", + "# now we expect 5 of the 19 runs to be failures, the invalid credit cards\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: supplying your own regex pattern might cause slower evaluation of the containsCreditCardConstraint, depending on its complexity.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "credit_card:\n", + " test_name total_run failed\n", + " value MATCH ^(?:[0-9]{4}[\\s-]?){3,4}$ 19 8\n" + ] + } + ], + "source": [ + "# you can provide your own credit card regex and check the values against it\n", + "custom_credit_card_constraint = containsCreditCardConstraint(regex_pattern = r\"^(?:[0-9]{4}[\\s-]?){3,4}$\")\n", + "dc = DatasetConstraints(None, value_constraints={\"credit_card\": [custom_credit_card_constraint]})\n", + "\n", + "profile = session.log_dataframe(credit_cards, 'test.data', constraints=dc)\n", + "# now more valid credit cards are being 
reported as failures\n", + "format_report(dc.report())\n", + "# running the containsCreditCardConstraint with your own regex pattern may cause slow evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SSN regex matching constraint" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import containsSSNConstraint" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "ssn:\n", + "\n", + "test_name:\tvalue MATCH ^(?!000|666|9[0-9]{2})[0-9]{3}[\\s-]?(?!00)[0-9]{2}[\\s-]?(?!0000)[0-9]{4}$\n", + "\n", + "total_run:\t8\n", + "\n", + "failed:\t\t4\n", + "\n" + ] + } + ], + "source": [ + "ssn_data = pd.DataFrame([\n", + " {\"ssn\": \"123-01-2335\"}, # valid\n", + " {\"ssn\": \"039780012\"}, # valid\n", + " {\"ssn\": \"000231324\"}, # invalid\n", + " {\"ssn\": \"666781132\"}, # invalid\n", + " {\"ssn\": \"926-89-1234\"}, # invalid\n", + " {\"ssn\": \"001-01-0001\"}, # valid\n", + " {\"ssn\": \"122 23 0001\"}, # valid\n", + " {\"ssn\": \"1234-12-123\"}, # invalid\n", + "])\n", + "\n", + "default_ssn_constraint = containsSSNConstraint()\n", + "\n", + "dc = DatasetConstraints(None, value_constraints={\"ssn\": [default_ssn_constraint]})\n", + "\n", + "profile = session.log_dataframe(ssn_data, 'test.data', constraints=dc)\n", + "# now we expect 4 of the 8 runs to be failures, the invalid ssn numbers\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: supplying your own regex pattern might cause slower evaluation of the containsSSNConstraint, depending on its complexity.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "ssn:\n", + " test_name total_run failed\n", + " value MATCH ^[0-9]{3}-[0-9]{2}-[0-9]{4}$ 8 5\n" + ] + } + ], + "source": [ + "# you can provide your own ssn regex and check the values against it\n", + "custom_ssn_constraint = containsSSNConstraint(regex_pattern = r\"^[0-9]{3}-[0-9]{2}-[0-9]{4}$\")\n", + "dc = DatasetConstraints(None, value_constraints={\"ssn\": [custom_ssn_constraint]})\n", + "\n", + "profile = session.log_dataframe(ssn_data, 'test.data', constraints=dc)\n", + "# now more valid ssn numbers are being reported as failures\n", + "format_report(dc.report())\n", + "# running the containsSSNConstraint with your own regex pattern may cause slow evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### URL regex matching constraint" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import containsURLConstraint" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "url:\n", + "\n", + "test_name:\tvalue MATCH ^(?:http(s)?:\\/\\/)?((www)|(?:[a-zA-z0-9-]+)\\.)(?:[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.(?:[a-zA-Z0-9]{1,6})\\b(?:[-a-zA-Z0-9@:%_\\+.~#?&//=]*))$\n", + "\n", + "total_run:\t10\n", + "\n", + "failed:\t\t4\n", + "\n" + ] + } + ], + "source": [ + "web_urls = pd.DataFrame([\n", + " {\"url\": 
\"http://www.example.com\"}, # valid\n", + " {\"url\": \"abc.test.com\"}, # valid (without protocol)\n", + " {\"url\": \"abc.w23w.asb#abc?a=2\"}, # valid (without protocol)\n", + " {\"url\": \"https://ab.abc.bc\"}, # valid\n", + " {\"url\": \"a.b.c\"}, # valid\n", + " {\"url\": \"abcd\"}, # invalid\n", + " {\"url\": \"123.w23.235\"}, # valid\n", + " {\"url\": \"asf://saf.we.12\"}, # invalid\n", + " {\"url\": \"12345\"}, # invalid\n", + " {\"url\": \"1.2\"}, # invalid\n", + " \n", + "])\n", + "\n", + "default_url_constraint = containsURLConstraint()\n", + "dc = DatasetConstraints(None, value_constraints={\"url\": [default_url_constraint]})\n", + "\n", + "profile = session.log_dataframe(web_urls, 'test.data', constraints=dc)\n", + "# now we expect the 4 invalid urls, out of the 10 in total, to be reported as failures\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: supplying your own regex pattern might cause slower evaluation of the containsURLConstraint, depending on its complexity.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "url:\n", + " test_name total_run failed\n", + " value MATCH ^http(s)?:\\/\\/(www\\.)?.+\\..+$ 10 8\n" + ] + } + ], + "source": [ + "# you can provide your own ur; regex and check the values against it\n", + "custom_url_constraint = containsURLConstraint(regex_pattern = r\"^http(s)?:\\/\\/(www\\.)?.+\\..+$\")\n", + "dc = DatasetConstraints(None, value_constraints={\"url\": [custom_url_constraint]})\n", + "\n", + "profile = session.log_dataframe(web_urls, 'test.data', constraints=dc)\n", + "# with the new regex more valid urls are being reported as failures\n", + "format_report(dc.report())\n", + "# running the containsURLConstraint with your own regex pattern may cause slow evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Datetime/json constraints" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "str1:\n", + " test_name total_run failed\n", + " value APPLY_FUNC _try_parse_dateutil 14 9\n", + " value APPLY_FUNC _try_parse_json 14 12\n", + " value APPLY_FUNC _matches_json_schema 14 12\n", + " value APPLY_FUNC _try_parse_strftime_format 14 12\n" + ] + } + ], + "source": [ + "from whylogs.core.statistics.constraints import (\n", + " dateUtilParseableConstraint, jsonParseableConstraint, matchesJsonSchemaConstraint, strftimeFormatConstraint )\n", + "df = pd.DataFrame(\n", + " [\n", + " {\"str1\": \"1990-12-1\"}, # dateutil valid; strftime valid\n", + " {\"str1\": \"1990/12/1\"},\n", + " {\"str1\": \"today is 2019-03-27\"}, # dateutil invalid\n", + " {\"str1\": \"Monday at 12:01am\"},\n", + " {\"str1\": \"xyz_not_a_date\"}, # dateutil invalid\n", + " {\"str1\": \"yesterday\"}, # dateutil invalid\n", + " {\"str1\": {\"name\": \"s\", \"w2w2\": \"dgsg\", \"years\": 232, \"abc\": 1}}, # schema valid\n", + " {\"str1\": {\"name\": \"s\", \"w2w2\": \"dgsg\", \"years\": 232}}, # schema invalid\n", + " {\"str1\": json.dumps({\"name\": \"s\", \"w2w2\": \"dgsg\", \"years\": 232, \"abc\": 1})}, # json valid, schema valid\n", + " {\"str1\": json.dumps({\"name\": \"s\", \"w2w2\": \"dgsg\", \"years\": \"232\", \"abc\": 1})}, # json valid\n", + " {\"str1\": \"random str : fail 
everything\"},\n", + " {\"str1\": \"2003-12-23\"}, # strftime valid, dateutil valid\n", + " {\"str1\": \"2003-15-23\"}, # strftime invalid, dateutil invalid\n", + " {\"str1\": \"10-12-32\"}, # strftime invalid, dateutil valid\n", + " ]\n", + " )\n", + "\n", + "dateutil_parseable = dateUtilParseableConstraint()\n", + "json_parseable = jsonParseableConstraint()\n", + "\n", + "json_schema = {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"name\": {\"type\": \"string\"},\n", + " \"years\": {\"type\": \"integer\"},\n", + " },\n", + " \"required\": [\"name\", \"abc\"],\n", + " }\n", + "matches_json_schema = matchesJsonSchemaConstraint(json_schema=json_schema)\n", + "\n", + "is_strftime = strftimeFormatConstraint(format=\"%Y-%m-%d\")\n", + "\n", + "apply_func_constraints = [dateutil_parseable, json_parseable, matches_json_schema, is_strftime]\n", + "\n", + "\n", + "dc = DatasetConstraints(None, value_constraints={\"str1\": apply_func_constraints})\n", + "profile = session.log_dataframe(df, 'test3.data', constraints=dc)\n", + "\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Seeing the comments above, when creating the dataset, we can realize which values fail or pass, for which constraint. The dateutil constraint has 5 passing values in the dataset, and the other 3 constraints have only 2 values that pass from total of 14." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Entropy and Distributional Measures" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Entropy" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check if the colmn entropy is in some interval [a, b]. Works both for discrete and continuous valued columns." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import approximateEntropyBetweenConstraint" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Entropy on categorical data" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "pets = np.random.choice(['cat', 'dog', 'rabbit', 'hamster'], size=50, replace=True, p=[0.3, 0.1, 0.2, 0.4])\n", + "pet_df = pd.DataFrame({\n", + " \"pet\": pets\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "pet:\n", + " test_name total_run failed\n", + " summary entropy BTWN 0.7 and 1.9 1 0\n" + ] + } + ], + "source": [ + "# check if the entropy of the pet_df 'pet' column is between 0.7 and 1.9 (the actual value is 1.85)\n", + "entropy_between_values_constraint = approximateEntropyBetweenConstraint(lower_value=0.7, upper_value=1.9)\n", + "\n", + "dc = DatasetConstraints(None, summary_constraints={\"pet\": [entropy_between_values_constraint]})\n", + "\n", + "profile = session.log_dataframe(pet_df, 'test.data', constraints=dc)\n", + "# now we expect the constraint to complete without failures\n", + "report = profile.apply_summary_constraints()\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Entropy on continuous data" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "# sample 100 data points from a normal distribution with mean 30000 and standard deviation 15000 to represent sales values\n", + "sales = np.random.normal(loc=30000, scale=15000, size=100)\n", + "\n", + "sales_df = pd.DataFrame({\n", + " \"sales\": sales\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "sales:\n", + " test_name total_run failed\n", + " summary entropy BTWN 2.3 and 3.5 1 1\n" + ] + } + ], + "source": [ + "# check if the entropy of the sales_df 'sales' column is between 2.3 and 3.5 (the actual value is between 3.8 and 3.9)\n", + "entropy_between_values_constraint_cont = approximateEntropyBetweenConstraint(lower_value=2.3, upper_value=3.5)\n", + "\n", + "dc = DatasetConstraints(None, summary_constraints={\"sales\": [entropy_between_values_constraint_cont]})\n", + "\n", + "profile = session.log_dataframe(sales_df, 'test.data', constraints=dc)\n", + "# now we expect the constraint to fail since the entropy is between 3.8 and 3.9\n", + "report = profile.apply_summary_constraints()\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### KS Test" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The KS Test can only be executed on continuous data."
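The hypothesis test behind this constraint can be sketched with scipy's two-sample Kolmogorov-Smirnov test. whylogs computes the statistic from its quantile sketches, so its p-value approximates what `scipy.stats.ks_2samp` returns on the raw samples; the sketch below is for intuition only.

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(42)
reference = rng.normal(loc=30000, scale=15000, size=100)  # e.g. last year's sales
target = rng.normal(loc=45000, scale=10000, size=100)     # e.g. this year's sales

statistic, p_value = stats.ks_2samp(target, reference)
# a "p-value GT 0.05" constraint passes only when p_value > 0.05,
# i.e. when the test gives no evidence that the two distributions differ
print(statistic, p_value, p_value > 0.05)
```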
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import parametrizedKSTestPValueGreaterThanConstraint" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# this would be the reference distribution, sales 2020\n", + "sales_2020 = np.random.normal(loc=30000, scale=15000, size=100)\n", + "# this would be the target distribution, sales 2021\n", + "sales_2021 = np.random.normal(loc=45000, scale=10000, size=100)\n", + "# we want to check if the sales in 2020 have the same distribution as the sales in 2021" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "sales:\n", + " test_name total_run failed\n", + " summary ks_test p-value GT 0.05 1 1\n" + ] + } + ], + "source": [ + "sales_2021_df = pd.DataFrame({\n", + " \"sales\": sales_2021\n", + "})\n", + "\n", + "# check if the p-value of the ks test for reference distribution sales_2020 is greater than 0.05 \n", + "# if so, we do not reject the null hypothesis\n", + "ks_test_p_value_greater_than = parametrizedKSTestPValueGreaterThanConstraint(reference_distribution=sales_2020, p_value=0.05)\n", + "\n", + "dc = DatasetConstraints(None, summary_constraints={\"sales\": [ks_test_p_value_greater_than]})\n", + "\n", + "profile = session.log_dataframe(sales_2021_df, 'test.data', constraints=dc)\n", + "# we expect the constraint to fail since the two distributions differ\n", + "report = profile.apply_summary_constraints()\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The p-value is less than 0.05, which means we can reject the null hypothesis at this significance level." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### KL Divergence" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The KL Divergence constraint is supported for both discrete and continuous variables."
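In the discrete case the quantity being thresholded is the standard KL divergence D(P‖Q) between the target distribution P and the reference distribution Q. A plain-numpy sketch follows; whylogs derives the probability mass functions from its histogram and frequent-items sketches internally (the warning output further below even shows its `pmf_target` / `pmf_reference` formula), so this helper is for intuition only.

```python
import numpy as np

def kl_divergence(p_counts: np.ndarray, q_counts: np.ndarray) -> float:
    """D(P || Q) in nats for two count vectors over the same support."""
    p = p_counts / p_counts.sum()
    q = q_counts / q_counts.sum()
    mask = p > 0  # terms with p == 0 contribute nothing to the sum
    # note: q must be nonzero wherever p is nonzero, otherwise the divergence is infinite
    return float(np.sum(p[mask] * np.log(p[mask] / q[mask])))

# e.g. counts of ['cat', 'dog', 'rabbit', 'hamster'] in a target and a reference sample
print(kl_divergence(np.array([15, 5, 10, 20]), np.array([25, 5, 10, 10])))
```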
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import columnKLDivergenceLessThanConstraint" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### KL Divergence for continuous case" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "sales:\n", + " test_name total_run failed\n", + " summary kl_divergence threshold LT 0.6 1 1\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/milena/Documents/GitHub/whylogs/src/whylogs/core/summaryconverters.py:184: RuntimeWarning: divide by zero encountered in true_divide\n", + " kl_divergence = np.sum(np.where(pmf_target != 0, pmf_target * np.log(pmf_target / pmf_reference), 0))\n", + "/home/milena/Documents/GitHub/whylogs/src/whylogs/core/summaryconverters.py:184: RuntimeWarning: invalid value encountered in true_divide\n", + " kl_divergence = np.sum(np.where(pmf_target != 0, pmf_target * np.log(pmf_target / pmf_reference), 0))\n", + "/home/milena/Documents/GitHub/whylogs/src/whylogs/core/summaryconverters.py:184: RuntimeWarning: divide by zero encountered in log\n", + " kl_divergence = np.sum(np.where(pmf_target != 0, pmf_target * np.log(pmf_target / pmf_reference), 0))\n", + "/home/milena/Documents/GitHub/whylogs/src/whylogs/core/summaryconverters.py:184: RuntimeWarning: invalid value encountered in multiply\n", + " kl_divergence = np.sum(np.where(pmf_target != 0, pmf_target * np.log(pmf_target / pmf_reference), 0))\n" + ] + } + ], + "source": [ + "# check if the KL divergence from the reference distribution is less than 0.6 \n", + "kl_divergence_less_than = columnKLDivergenceLessThanConstraint(reference_distribution=sales_2020, threshold=0.6)\n", + "\n", + "dc = DatasetConstraints(None, summary_constraints={\"sales\": [kl_divergence_less_than]})\n", + "\n", + "profile = session.log_dataframe(sales_2021_df, 'test.data', constraints=dc)\n", + "# now we expect the constraint to fail\n", + "report = profile.apply_summary_constraints()\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The constraint fails: the KL divergence between the 2021 sales distribution and the 2020 reference distribution exceeds the 0.6 threshold."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### KL Divergence for discrete case" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "pet:\n", + " test_name total_run failed\n", + " summary kl_divergence threshold LT 0.6 1 0\n" + ] + } + ], + "source": [ + "# create a new distribution from the pets sample with different probabilities\n", + "pets_reference = np.random.choice(['cat', 'dog', 'rabbit', 'hamster'], size=50, replace=True, p=[0.5, 0.1, 0.2, 0.2])\n", + "\n", + "# check if the KL divergence from the reference distribution is less than 0.6 \n", + "kl_divergence_less_than = columnKLDivergenceLessThanConstraint(reference_distribution=pets_reference, threshold=0.6)\n", + "\n", + "dc = DatasetConstraints(None, summary_constraints={\"pet\": [kl_divergence_less_than]})\n", + "\n", + "profile = session.log_dataframe(pet_df, 'test.data', constraints=dc)\n", + "# this time we expect the constraint to pass\n", + "report = profile.apply_summary_constraints()\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Chi-Squared Test" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Chi-Squared test constraint is only supported for categorical values." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import columnChiSquaredTestPValueGreaterThanConstraint" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "pet:\n", + " test_name total_run failed\n", + " summary chi_squared_test p-value GT 0.05 1 0\n" + ] + } + ], + "source": [ + "# create a new distribution from the pets sample with different probabilities\n", + "pets_reference = np.random.choice(['cat', 'dog', 'rabbit', 'hamster'], size=50, replace=True, p=[0.6, 0.2, 0.1, 0.1])\n", + "\n", + "# check if the p-value of the chi-squared test is greater than 0.05 \n", + "chi_squared_p_value_greater_than = columnChiSquaredTestPValueGreaterThanConstraint(reference_distribution=pets_reference, p_value=0.05)\n", + "\n", + "dc = DatasetConstraints(None, summary_constraints={\"pet\": [chi_squared_p_value_greater_than]})\n", + "\n", + "profile = session.log_dataframe(pet_df, 'test.data', constraints=dc)\n", + "# we expect the constraint to pass\n", + "report = profile.apply_summary_constraints()\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The p-value is greater than 0.05, which means that we cannot reject the null hypothesis that the two distributions are equal at this significance level."
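The underlying test can be sketched with scipy, using the reference sample to derive expected category counts; whylogs does the equivalent from its frequent-items sketches, so this is illustrative only. The expected counts are rescaled so both vectors sum to the same total, as `scipy.stats.chisquare` requires.

```python
import numpy as np
import pandas as pd
from scipy import stats

rng = np.random.default_rng(7)
categories = ["cat", "dog", "rabbit", "hamster"]
target = rng.choice(categories, size=50, p=[0.3, 0.1, 0.2, 0.4])
reference = rng.choice(categories, size=50, p=[0.6, 0.2, 0.1, 0.1])

observed = pd.Series(target).value_counts()
expected = pd.Series(reference).value_counts().reindex(observed.index, fill_value=0)
expected = expected / expected.sum() * observed.sum()  # rescale to the observed total

statistic, p_value = stats.chisquare(f_obs=observed, f_exp=expected)
print(p_value > 0.05)  # True means a "p-value GT 0.05" constraint would pass
```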
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table shape constraints" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Table shape constraint failures -\n", + " test_name total_run failed\n", + " table total_row_number EQ 15 1 1\n", + " table total_row_number EQ 14 1 0\n", + " table columns CONTAIN this_column_does_not_exist 1 1\n", + " table columns CONTAIN col2 1 0\n", + " table columns EQ {'this', 'is', 'set', 'wrong', 'columns', 'a'} 1 1\n", + " table columns EQ {'str1', 'col2'} 1 0\n" + ] + } + ], + "source": [ + "from whylogs.core.statistics.constraints import (\n", + " numberOfRowsConstraint, columnExistsConstraint, columnsMatchSetConstraint )\n", + "\n", + "# using the above dataframe with the string values, just adding a column\n", + "df['col2'] = range(len(df))\n", + "\n", + "rows = numberOfRowsConstraint(n_rows=len(df)+1) # fail\n", + "rows_2 = numberOfRowsConstraint(n_rows=len(df)) # pass\n", + "\n", + "column_exist = columnExistsConstraint(\"this_column_does_not_exist\") # fail\n", + "column_exist2 = columnExistsConstraint(\"col2\") # pass\n", + "\n", + "set1 = {'this', 'is', 'a', 'wrong', 'columns', 'set'}\n", + "columns_set = set(df.columns)\n", + "columns_match = columnsMatchSetConstraint(set1) # fail\n", + "columns_match2 = columnsMatchSetConstraint(columns_set) # pass\n", + "\n", + "table_shape_constraints = [rows, rows_2, column_exist, column_exist2, columns_match, columns_match2]\n", + "\n", + "dc = DatasetConstraints(None, table_shape_constraints=table_shape_constraints)\n", + "\n", + "profile = session.log_dataframe(df, \"test.data\", constraints=dc)\n", + "\n", + "report = profile.apply_table_shape_constraints()\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Multi column constraints\n", + "### Logical operations between values of the specified columns" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Multi column constraint failures -\n", + " test_name total_run failed\n", + " multi column value col1 GT col2 4 2\n", + " multi column value col1 EQ col2 4 3\n" + ] + } + ], + "source": [ + "from whylogs.core.statistics.constraints import columnValuesAGreaterThanBConstraint, columnValuesAEqualBConstraint\n", + "\n", + "df = pd.DataFrame({\"col1\": [4, 5, 6, 7], \"col2\": [0, 1, 6, 15]})\n", + "\n", + "a_gt_b = columnValuesAGreaterThanBConstraint(column_A=\"col1\", column_B=\"col2\")\n", + "a_eq_b = columnValuesAEqualBConstraint(column_A=\"col1\", column_B=\"col2\")\n", + "\n", + "dc = DatasetConstraints(None, multi_column_value_constraints=[a_gt_b, a_eq_b])\n", + "\n", + "profile = session.log_dataframe(df, \"test4.data\", constraints=dc)\n", + "\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The comparison runs value by value: for col1 > col2 only 2 of the 4 rows pass, and for col1 == col2 only 1 row passes (the third elements of the two columns are equal)."
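The same row-wise logic can be written directly in pandas, which makes it easy to verify the counts in the report above (a sketch for intuition, not how whylogs implements it):

```python
import pandas as pd

df = pd.DataFrame({"col1": [4, 5, 6, 7], "col2": [0, 1, 6, 15]})

gt_passes = int((df["col1"] > df["col2"]).sum())   # 2 rows pass, so 2 of 4 fail
eq_passes = int((df["col1"] == df["col2"]).sum())  # 1 row passes, so 3 of 4 fail
print(gt_passes, eq_passes)
```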
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sum of row values of multiple columns equals some value, or some column value" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import sumOfRowValuesOfMultipleColumnsEqualsConstraint" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "total_expences = pd.DataFrame({\n", + " \"employees %\": [25, 45, 15, 3],\n", + " \"equipment %\": [10, 12, 4, 9],\n", + " \"materials %\": [40, 35, 45, 55],\n", + " \"other %\": [25, 8, 4, 6]\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Multi column constraint failures -\n", + "\n", + "test_name:\tmulti column value SUM ['employees %', 'equipment %', 'materials %', 'other %'] EQ 100\n", + "\n", + "total_run:\t4\n", + "\n", + "failed:\t\t2\n", + "\n" + ] + } + ], + "source": [ + "# check if the expense percentages in each row sum to 100 %\n", + "sum_of_row_values_eq_100 = sumOfRowValuesOfMultipleColumnsEqualsConstraint(\n", + " columns=[\"employees %\", \"equipment %\", \"materials %\", \"other %\"],\n", + " value=100\n", + ")\n", + "\n", + "dc = DatasetConstraints(None, multi_column_value_constraints=[sum_of_row_values_eq_100])\n", + "\n", + "# the multi-column value constraints do not need to be applied to the data \n", + "# they are applied at the time of logging\n", + "profile = session.log_dataframe(total_expences, \"test.data\", constraints=dc)\n", + "\n", + "# we expect 2 of the 4 rows to be failures since the last two rows do not sum to 100\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Multi column constraint failures -\n", + " test_name total_run failed\n", + " multi column value SUM ['equipment %', 'materials %'] EQ ['other %'] 4 4\n" + ] + } + ], + "source": [ + "# check if the sum of the row values (percentages) for 'equipment %' and 'materials %' equals the value of 'other %'\n", + "sum_of_row_values_eq_other = sumOfRowValuesOfMultipleColumnsEqualsConstraint(\n", + " columns=[\"equipment %\", \"materials %\"],\n", + " value='other %'\n", + ")\n", + "\n", + "dc = DatasetConstraints(None, multi_column_value_constraints=[sum_of_row_values_eq_other])\n", + "profile = session.log_dataframe(total_expences, \"test.data\", constraints=dc)\n", + "\n", + "# we expect all rows to be failures since the sum of 'equipment %' and 'materials %' is not equal to the value of the column 'other %'\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Column Pair Values in Set" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check if the values of a pair of columns are in a predefined set of pair values."
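Conceptually this is a membership test on row-wise (column_A, column_B) tuples. A self-contained pandas sketch of the same check, with an illustrative frame rather than the notebook's data:

```python
import pandas as pd

frame = pd.DataFrame({"grade": ["A", "A", "B"], "subgrade": ["A1", "A3", "B2"]})
valid_pairs = {("A", "A1"), ("A", "A2"), ("B", "B1"), ("B", "B2")}

in_set = [pair in valid_pairs for pair in zip(frame["grade"], frame["subgrade"])]
# rows where in_set is False would be counted as constraint failures
print(in_set)  # [True, False, True]
```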
+ ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import columnPairValuesInSetConstraint" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "product_grades = pd.DataFrame({\n", + " \"product\": [\"ProductA\", \"ProductB\", \"ProductC\", \"ProductD\", \"ProductE\"],\n", + " \"grade\": [\"A\", \"A\", \"B\", \"C\", \"C\"],\n", + " \"subgrade\": [\"A1\", \"A3\", \"B2\", \"C2\", \"C2\"]\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Multi column constraint failures -\n", + "\n", + "test_name:\tmulti column value ['grade', 'subgrade'] IN {('C', 'C2'), ('B', 'B2'), ('B', 'B1'), ('A', 'A1'), ('A', 'A2'), ('C', 'C1')}\n", + "\n", + "total_run:\t5\n", + "\n", + "failed:\t\t1\n", + "\n" + ] + } + ], + "source": [ + "# we want to check if each of the grade and subgrade pairs are in the specific set\n", + "grade_subgrade_pairs_in_set = columnPairValuesInSetConstraint(\n", + " column_A=\"grade\", \n", + " column_B=\"subgrade\",\n", + " value_set = {(\"A\", \"A1\"), (\"A\", \"A2\"), (\"B\", \"B1\"), (\"B\", \"B2\"), (\"C\", \"C1\"), (\"C\", \"C2\")}\n", + ")\n", + "\n", + "dc = DatasetConstraints(None, multi_column_value_constraints=[grade_subgrade_pairs_in_set])\n", + "profile = session.log_dataframe(product_grades, \"test.data\", constraints=dc)\n", + "\n", + "# we expect 1 out of 5 pairs to be a failure, specifically (\"A\", \"A3\")\n", + "format_report(dc.report())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Column Values Unique within Row" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check if the value of the specified column is unique within each row." 
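Row-wise uniqueness means the value in the chosen column must differ from every other value in the same row. A small pandas sketch of the idea (illustrative only, with hypothetical data):

```python
import pandas as pd

frame = pd.DataFrame({
    "username": ["jd123", "jane.doe@example.com"],
    "email": ["john.doe@example.com", "jane.doe@example.com"],
})

# the email is unique within its row iff it matches exactly one cell: itself
unique_in_row = frame.apply(lambda row: (row == row["email"]).sum() == 1, axis=1)
print(unique_in_row.tolist())  # [True, False]
```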
+ ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "from whylogs.core.statistics.constraints import columnValuesUniqueWithinRow" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "users = pd.DataFrame({\n", + " \"first_name\": [\"John\", \"Jane\", \"Bob\", \"Anna\"],\n", + " \"last_name\": [\"Doe\", \"Doe\", \"Smith\", \"Jones\"],\n", + " \"username\": [\"jd123\", \"jane.doe@example.com\", \"bobsmith\", \"_anna_\"],\n", + " \"email\": [\"john.doe@example.com\", \"jane.doe@example.com\", \"bob.smith@example.com\", \"anna_jones@example.com\"],\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Multi column constraint failures -\n", + " test_name total_run failed\n", + " multi column value email NOT_IN all 4 1\n" + ] + } + ], + "source": [ + "# check if the emails are unique compared to other fields for each user\n", + "# suppose we do not want to accept a username which is the same as the user's email\n", + "email_values_unique_within_row = columnValuesUniqueWithinRow(column_A=\"email\")\n", + "\n", + "dc = DatasetConstraints(None, multi_column_value_constraints=[email_values_unique_within_row])\n", + "profile = session.log_dataframe(users, \"test.data\", constraints=dc)\n", + "\n", + "# we expect 1 out of 4 evaluations of the constraint to be a failure, since Jane Doe's email is the same as their username\n", + "format_report(dc.report())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From a17416a98203c9e19c1899addd6b8730f552fa52 Mon Sep 17 00:00:00 2001 From: pecop2 Date: Mon, 17 Jan 2022 15:29:08 +0100 Subject: [PATCH 02/10] Added Chi squared constraint example supplying a map of items and frequencies as counts --- examples/Constraints2.ipynb | 172 ++++++++++++++++++++++++++++++++++-- 1 file changed, 165 insertions(+), 7 deletions(-) diff --git a/examples/Constraints2.ipynb b/examples/Constraints2.ipynb index b901828e8e..5bef3dba9e 100644 --- a/examples/Constraints2.ipynb +++ b/examples/Constraints2.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "f13777a7", "metadata": {}, "source": [ "### whylogs provides a specific helper function for each common constraint. Fall back to the generic ValueConstraint and SummaryConstraint only when you need a custom constraint that no helper covers." @@ -10,7 +11,10 @@ { "cell_type": "code", "execution_count": 1, + "id": "b32c0dc0", + "metadata": { + "scrolled": false + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARN: Missing config\n" ] } ], "source": [ "from whylogs import get_or_create_session\n", "from whylogs.util.protobuf import message_to_json\n", "\n", "# create session\n", "session = get_or_create_session()" @@ -31,6 +35,7 @@ { "cell_type": "code", "execution_count": 2, + "id": "e0591c0c", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import json" @@ -41,7 +46,8 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 3, + "id": "f5ba8afd", "metadata": {}, "outputs": [], "source": [ @@ -89,6 +95,7 @@ }, { "cell_type": "markdown", + "id": "7b6068d2", "metadata": {}, "source": [ "## Between summary constraints on summary fields such as stddev, min, max, mean..." 
@@ -97,6 +104,7 @@ { "cell_type": "code", "execution_count": 4, + "id": "bfd0b5a7", "metadata": {}, "outputs": [], "source": [ @@ -118,6 +126,7 @@ { "cell_type": "code", "execution_count": 133, + "id": "71c0b923", "metadata": { "scrolled": false }, @@ -265,6 +274,7 @@ }, { "cell_type": "markdown", + "id": "21f3cc64", "metadata": {}, "source": [ "#### Summary constraints are applied with apply_summary_constraints on the DatasetProfile." @@ -273,6 +283,7 @@ { "cell_type": "code", "execution_count": 136, + "id": "5625be78", "metadata": { "scrolled": false }, @@ -306,6 +317,7 @@ }, { "cell_type": "markdown", + "id": "a42655a1", "metadata": {}, "source": [ "As we can see **mean BTWN** passes and the **stddev BTWN** fails as they should." @@ -313,6 +325,7 @@ }, { "cell_type": "markdown", + "id": "037e5ee9", "metadata": {}, "source": [ "## Summary constraints for distinct, unique and most common values in a column" @@ -320,6 +333,7 @@ }, { "cell_type": "markdown", + "id": "b7daf512", "metadata": {}, "source": [ "### Distinct values in a column" @@ -328,6 +342,7 @@ { "cell_type": "code", "execution_count": 6, + "id": "10e1e6ac", "metadata": {}, "outputs": [], "source": [ @@ -338,6 +353,7 @@ { "cell_type": "code", "execution_count": 137, + "id": "e2ae7cff", "metadata": {}, "outputs": [], "source": [ @@ -348,6 +364,7 @@ }, { "cell_type": "markdown", + "id": "151629fd", "metadata": {}, "source": [ "#### Applying summary constraints sent as an argument to apply_summary_constraints function on the same profile as before!" @@ -356,6 +373,7 @@ { "cell_type": "code", "execution_count": 138, + "id": "d5370632", "metadata": {}, "outputs": [ { @@ -381,6 +399,7 @@ }, { "cell_type": "markdown", + "id": "dcbdd65c", "metadata": {}, "source": [ "### Unique column value count and proportion constraints" @@ -389,6 +408,7 @@ { "cell_type": "code", "execution_count": 4, + "id": "221efa41", "metadata": {}, "outputs": [], "source": [ @@ -401,6 +421,7 @@ { "cell_type": "code", "execution_count": 11, + "id": "e0b04634", "metadata": {}, "outputs": [], "source": [ @@ -415,6 +436,7 @@ { "cell_type": "code", "execution_count": 12, + "id": "fc144d79", "metadata": {}, "outputs": [ { @@ -445,6 +467,7 @@ }, { "cell_type": "markdown", + "id": "89bad55b", "metadata": {}, "source": [ "### Column most common value in set constraint" @@ -453,6 +476,7 @@ { "cell_type": "code", "execution_count": 7, + "id": "cf56143a", "metadata": {}, "outputs": [], "source": [ @@ -462,6 +486,7 @@ { "cell_type": "code", "execution_count": 8, + "id": "3f6f9c61", "metadata": {}, "outputs": [ { @@ -487,6 +512,7 @@ }, { "cell_type": "markdown", + "id": "b67689b0", "metadata": {}, "source": [ "### Column values not null" @@ -495,6 +521,7 @@ { "cell_type": "code", "execution_count": 9, + "id": "10453a81", "metadata": {}, "outputs": [], "source": [ @@ -504,6 +531,7 @@ { "cell_type": "code", "execution_count": 12, + "id": "e1cbc8a2", "metadata": {}, "outputs": [ { @@ -534,6 +562,7 @@ }, { "cell_type": "markdown", + "id": "851f72c4", "metadata": {}, "source": [ "### Column value type equals or is in set constraint" @@ -542,6 +571,7 @@ { "cell_type": "code", "execution_count": 8, + "id": "30e2f233", "metadata": {}, "outputs": [], "source": [ @@ -555,6 +585,7 @@ { "cell_type": "code", "execution_count": 10, + "id": "f7a61d75", "metadata": {}, "outputs": [ { @@ -591,6 +622,7 @@ }, { "cell_type": "markdown", + "id": "aaaaa2dd", "metadata": {}, "source": [ "# Column values in set" @@ -599,6 +631,7 @@ { "cell_type": "code", "execution_count": 22, + "id": "3c669679", 
"metadata": {}, "outputs": [], "source": [ @@ -608,6 +641,7 @@ { "cell_type": "code", "execution_count": 29, + "id": "e7acd883", "metadata": {}, "outputs": [ { @@ -644,6 +678,7 @@ }, { "cell_type": "markdown", + "id": "6a9cb3aa", "metadata": {}, "source": [ "# Regex matching constraints" @@ -651,6 +686,7 @@ }, { "cell_type": "markdown", + "id": "4661890e", "metadata": {}, "source": [ "### String length value constraints using regex" @@ -659,6 +695,7 @@ { "cell_type": "code", "execution_count": 90, + "id": "ea7d2164", "metadata": {}, "outputs": [ { @@ -697,6 +734,7 @@ }, { "cell_type": "markdown", + "id": "f46a455e", "metadata": {}, "source": [ "### Email matching constraint" @@ -705,6 +743,7 @@ { "cell_type": "code", "execution_count": 60, + "id": "b679674c", "metadata": {}, "outputs": [], "source": [ @@ -714,6 +753,7 @@ { "cell_type": "code", "execution_count": 108, + "id": "041bd248", "metadata": {}, "outputs": [ { @@ -757,6 +797,7 @@ { "cell_type": "code", "execution_count": 111, + "id": "4d4f02e3", "metadata": {}, "outputs": [ { @@ -790,6 +831,7 @@ }, { "cell_type": "markdown", + "id": "c959c36e", "metadata": {}, "source": [ "### Credit Card matching constraint" @@ -798,6 +840,7 @@ { "cell_type": "code", "execution_count": 112, + "id": "73901092", "metadata": {}, "outputs": [], "source": [ @@ -807,6 +850,7 @@ { "cell_type": "code", "execution_count": 118, + "id": "3f0d7e8f", "metadata": {}, "outputs": [ { @@ -861,6 +905,7 @@ { "cell_type": "code", "execution_count": 120, + "id": "7ce86172", "metadata": {}, "outputs": [ { @@ -894,6 +939,7 @@ }, { "cell_type": "markdown", + "id": "15354468", "metadata": {}, "source": [ "### SSN regex matching constraint" @@ -902,6 +948,7 @@ { "cell_type": "code", "execution_count": 121, + "id": "6e475de8", "metadata": {}, "outputs": [], "source": [ @@ -911,6 +958,7 @@ { "cell_type": "code", "execution_count": 123, + "id": "9d956856", "metadata": {}, "outputs": [ { @@ -953,6 +1001,7 @@ { "cell_type": "code", "execution_count": 125, + "id": "d7cf8fe8", "metadata": {}, "outputs": [ { @@ -986,6 +1035,7 @@ }, { "cell_type": "markdown", + "id": "514241b0", "metadata": {}, "source": [ "### URL regex matching constraint" @@ -994,6 +1044,7 @@ { "cell_type": "code", "execution_count": 127, + "id": "9b6b8257", "metadata": {}, "outputs": [], "source": [ @@ -1003,6 +1054,7 @@ { "cell_type": "code", "execution_count": 129, + "id": "52460643", "metadata": {}, "outputs": [ { @@ -1047,6 +1099,7 @@ { "cell_type": "code", "execution_count": 131, + "id": "d1bfc094", "metadata": {}, "outputs": [ { @@ -1080,6 +1133,7 @@ }, { "cell_type": "markdown", + "id": "eff98762", "metadata": {}, "source": [ "# Datetime/json constraints" @@ -1088,6 +1142,7 @@ { "cell_type": "code", "execution_count": 10, + "id": "a9fdc2ef", "metadata": {}, "outputs": [ { @@ -1152,6 +1207,7 @@ }, { "cell_type": "markdown", + "id": "16af2279", "metadata": {}, "source": [ "Seeing the comments above, when creating the dataset, we can realize which values fail or pass, for which constraint. The dateutil constraint has 5 passing values in the dataset, and the other 3 constraints have only 2 values that pass from total of 14." 
@@ -1159,6 +1215,7 @@ }, { "cell_type": "markdown", + "id": "dd89c8d7", "metadata": {}, "source": [ "# Entropy and Distributional Measures" @@ -1166,6 +1223,7 @@ }, { "cell_type": "markdown", + "id": "b31901ff", "metadata": {}, "source": [ "### Entropy" @@ -1173,6 +1231,7 @@ }, { "cell_type": "markdown", + "id": "426903d3", "metadata": {}, "source": [ "Check if the colmn entropy is in some interval [a, b]. Works both for discrete and continuous valued columns." @@ -1181,6 +1240,7 @@ { "cell_type": "code", "execution_count": 5, + "id": "a879dea4", "metadata": {}, "outputs": [], "source": [ @@ -1189,6 +1249,7 @@ }, { "cell_type": "markdown", + "id": "0d102965", "metadata": {}, "source": [ "#### Entropy on categorical data" @@ -1196,7 +1257,8 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 6, + "id": "daa642d5", "metadata": {}, "outputs": [], "source": [ @@ -1209,6 +1271,7 @@ { "cell_type": "code", "execution_count": 19, + "id": "c4cb1421", "metadata": {}, "outputs": [ { @@ -1236,6 +1299,7 @@ }, { "cell_type": "markdown", + "id": "5b8171ca", "metadata": {}, "source": [ "#### Entropy on continuous data" @@ -1244,6 +1308,7 @@ { "cell_type": "code", "execution_count": 26, + "id": "55e59691", "metadata": {}, "outputs": [], "source": [ @@ -1258,6 +1323,7 @@ { "cell_type": "code", "execution_count": 50, + "id": "931a2585", "metadata": {}, "outputs": [ { @@ -1285,6 +1351,7 @@ }, { "cell_type": "markdown", + "id": "020e9ae2", "metadata": {}, "source": [ "### KS Test" @@ -1292,6 +1359,7 @@ }, { "cell_type": "markdown", + "id": "c3b4dcd2", "metadata": {}, "source": [ "The KS Test can only be executed on continuous data." @@ -1300,6 +1368,7 @@ { "cell_type": "code", "execution_count": 8, + "id": "c53fdd26", "metadata": {}, "outputs": [], "source": [ @@ -1309,6 +1378,7 @@ { "cell_type": "code", "execution_count": 11, + "id": "48251fb0", "metadata": {}, "outputs": [], "source": [ @@ -1322,6 +1392,7 @@ { "cell_type": "code", "execution_count": 12, + "id": "2d265980", "metadata": {}, "outputs": [ { @@ -1354,6 +1425,7 @@ }, { "cell_type": "markdown", + "id": "1958febe", "metadata": {}, "source": [ "The p-value is less than 0.05, which means we can reject the null hypothesis with this confidence level." @@ -1361,6 +1433,7 @@ }, { "cell_type": "markdown", + "id": "9ff32245", "metadata": {}, "source": [ "### KL Divergence" @@ -1368,6 +1441,7 @@ }, { "cell_type": "markdown", + "id": "a3d9344e", "metadata": {}, "source": [ "The KL Divergence constraint is supported for both discrete and continuous variables." @@ -1376,6 +1450,7 @@ { "cell_type": "code", "execution_count": 13, + "id": "c729bb38", "metadata": {}, "outputs": [], "source": [ @@ -1384,6 +1459,7 @@ }, { "cell_type": "markdown", + "id": "2300513e", "metadata": {}, "source": [ "#### KL Divergence for continuous case" @@ -1392,6 +1468,7 @@ { "cell_type": "code", "execution_count": 14, + "id": "2d865e39", "metadata": {}, "outputs": [ { @@ -1433,6 +1510,7 @@ }, { "cell_type": "markdown", + "id": "55ba531d", "metadata": {}, "source": [ "The distribution of sales in 2020 cannot be encoded with the distribution of sales in 2021." 
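For intuition about the quantity being thresholded here: in the discrete case, KL divergence is the expected log-ratio between the target and reference distributions. A small sketch with scipy; the probability vectors `p` and `q` are invented for illustration and do not come from the notebook's data:

```python
import numpy as np
from scipy.stats import entropy

# P: target distribution, Q: reference distribution, over the same categories
p = np.array([0.50, 0.30, 0.10, 0.10])
q = np.array([0.25, 0.25, 0.25, 0.25])

# entropy(p, q) computes KL(P || Q) = sum(p * log(p / q)), in nats
kl = entropy(p, q)
print(f"KL(P||Q) = {kl:.4f}")

# a greater-than constraint then passes or fails on a chosen threshold
print("greater than 0.6:", bool(kl > 0.6))
```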
@@ -1440,6 +1518,7 @@ }, { "cell_type": "markdown", + "id": "4ec0d8f1", "metadata": {}, "source": [ "#### KL Divergence for discrete case" @@ -1448,6 +1527,7 @@ { "cell_type": "code", "execution_count": 18, + "id": "5e22e532", "metadata": {}, "outputs": [ { @@ -1471,13 +1551,14 @@ "dc = DatasetConstraints(None, summary_constraints={\"pet\": [kl_divergence_greater_than]})\n", "\n", "profile = session.log_dataframe(pet_df, 'test.data', constraints=dc)\n", - "# now we expect the constraint to fail\n", + "# now we expect the constraint to not fail\n", "report = profile.apply_summary_constraints()\n", "format_report(report)" ] }, { "cell_type": "markdown", + "id": "ba3abc84", "metadata": {}, "source": [ "### Chi-Squared Test" @@ -1485,6 +1566,7 @@ }, { "cell_type": "markdown", + "id": "ef4c2790", "metadata": {}, "source": [ "The Chi-Squared test constraint is only supported for categorical values." @@ -1492,7 +1574,8 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 5, + "id": "efa2de2a", "metadata": {}, "outputs": [], "source": [ @@ -1502,6 +1585,7 @@ { "cell_type": "code", "execution_count": 21, + "id": "cbde8291", "metadata": {}, "outputs": [ { @@ -1525,13 +1609,67 @@ "dc = DatasetConstraints(None, summary_constraints={\"pet\": [chi_squared_p_value_greater_than]})\n", "\n", "profile = session.log_dataframe(pet_df, 'test.data', constraints=dc)\n", - "# now we expect the constraint to fail\n", + "# now we expect the constraint to not fail\n", + "report = profile.apply_summary_constraints()\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "id": "14fa9046", + "metadata": {}, + "source": [ + "The p-value is not greater than 0.05, which means that we can reject the null hypothesis that the distributions are equal within this confidence interval." + ] + }, + { + "cell_type": "markdown", + "id": "0b589597", + "metadata": {}, + "source": [ + "If you don't have a reference distribution for calculating the Chi-Squared Test, but you know the approximate frequencies of each of the items, you can use this constraint by supplying a mapping of items and frequencies as counts, in the reference distribution parameter of the constraint." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b4f105f1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constraint failures by feature - \n", + "pet:\n", + " test_name total_run failed\n", + " summary chi_squared_test p-value GT 0.05 1 0\n" + ] + } + ], + "source": [ + "# create a new distribtution from the pets sample with different probabilities\n", + "reference_dict_pets = {\n", + " 'cat': 30,\n", + " 'dog': 10,\n", + " 'rabbit': 5, \n", + " 'hamster': 5,\n", + "}\n", + "\n", + "# check if the kl divergence is greater than 0.6 \n", + "chi_squared_p_value_greater_than = columnChiSquaredTestPValueGreaterThanConstraint(reference_distribution=reference_dict_pets, p_value=0.05)\n", + "\n", + "dc = DatasetConstraints(None, summary_constraints={\"pet\": [chi_squared_p_value_greater_than]})\n", + "\n", + "profile = session.log_dataframe(pet_df, 'test.data', constraints=dc)\n", + "# now we expect the constraint to not fail since this is approximately the same distribution from the previous example\n", "report = profile.apply_summary_constraints()\n", "format_report(report)" ] }, { "cell_type": "markdown", + "id": "78607967", "metadata": {}, "source": [ "The p-value is not greater than 0.05, which means that we can reject the null hypothesis that the distributions are equal within this confidence interval." @@ -1539,6 +1677,7 @@ }, { "cell_type": "markdown", + "id": "a4113311", "metadata": {}, "source": [ "## Table shape constraints" @@ -1547,6 +1686,7 @@ { "cell_type": "code", "execution_count": 6, + "id": "cceadc2d", "metadata": {}, "outputs": [ { @@ -1595,6 +1735,7 @@ }, { "cell_type": "markdown", + "id": "1956cf2b", "metadata": {}, "source": [ "## Multi column constraints\n", @@ -1604,6 +1745,7 @@ { "cell_type": "code", "execution_count": 36, + "id": "0ae20c1d", "metadata": {}, "outputs": [ { @@ -1635,6 +1777,7 @@ }, { "cell_type": "markdown", + "id": "76062557", "metadata": {}, "source": [ "Value by value comparison. col1 values > col2 values, only 2 are passing, and col1 values == col 2 values only 1 is True (the third element from both the columns are equal)." @@ -1642,6 +1785,7 @@ }, { "cell_type": "markdown", + "id": "f744614c", "metadata": {}, "source": [ "### Sum of row values of multiple columns equals some value, or some column value" @@ -1650,6 +1794,7 @@ { "cell_type": "code", "execution_count": 6, + "id": "ae986496", "metadata": {}, "outputs": [], "source": [ @@ -1659,6 +1804,7 @@ { "cell_type": "code", "execution_count": 37, + "id": "b5a8a4a2", "metadata": {}, "outputs": [], "source": [ @@ -1673,6 +1819,7 @@ { "cell_type": "code", "execution_count": 38, + "id": "62430c24", "metadata": {}, "outputs": [ { @@ -1711,6 +1858,7 @@ { "cell_type": "code", "execution_count": 39, + "id": "9d1b7812", "metadata": {}, "outputs": [ { @@ -1740,6 +1888,7 @@ }, { "cell_type": "markdown", + "id": "bbbc748f", "metadata": {}, "source": [ "### Column Pair Values in Set" @@ -1747,6 +1896,7 @@ }, { "cell_type": "markdown", + "id": "fa997a38", "metadata": {}, "source": [ "Check if the values of a pair of columns are in a predefined set of pair values." 
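Stepping back to the chi-squared variant introduced above: supplying an item-to-count mapping reduces the constraint to a standard goodness-of-fit test, where the reference counts are rescaled to the observed sample size to obtain expected frequencies. A standalone sketch with scipy; the `observed` and `reference` numbers here are invented for illustration:

```python
import numpy as np
from scipy.stats import chisquare

# observed category counts in the profiled column (invented numbers)
observed = {"cat": 25, "dog": 12, "rabbit": 7, "hamster": 6}
# reference frequencies supplied as an item -> count mapping
reference = {"cat": 30, "dog": 10, "rabbit": 5, "hamster": 5}

categories = sorted(observed)
obs = np.array([observed[c] for c in categories], dtype=float)
ref = np.array([reference[c] for c in categories], dtype=float)

# rescale reference counts so the expected frequencies sum to the observed total
expected = ref / ref.sum() * obs.sum()

stat, p_value = chisquare(f_obs=obs, f_exp=expected)
print(f"chi2 = {stat:.3f}, p = {p_value:.3f}")
# the constraint passes when p_value exceeds the configured threshold, e.g. 0.05
```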
@@ -1755,6 +1905,7 @@ { "cell_type": "code", "execution_count": 40, + "id": "c7cbd1e0", "metadata": {}, "outputs": [], "source": [ @@ -1764,6 +1915,7 @@ { "cell_type": "code", "execution_count": 41, + "id": "9c679454", "metadata": {}, "outputs": [], "source": [ @@ -1777,6 +1929,7 @@ { "cell_type": "code", "execution_count": 42, + "id": "b6db3720", "metadata": {}, "outputs": [ { @@ -1812,6 +1965,7 @@ }, { "cell_type": "markdown", + "id": "72a21ad0", "metadata": {}, "source": [ "### Column Values Unique within Row" @@ -1819,6 +1973,7 @@ }, { "cell_type": "markdown", + "id": "cf52c403", "metadata": {}, "source": [ "Check if the value of the specified column is unique within each row." @@ -1827,6 +1982,7 @@ { "cell_type": "code", "execution_count": 43, + "id": "42ef3e02", "metadata": {}, "outputs": [], "source": [ @@ -1836,6 +1992,7 @@ { "cell_type": "code", "execution_count": 45, + "id": "db57da2c", "metadata": {}, "outputs": [], "source": [ @@ -1850,6 +2007,7 @@ { "cell_type": "code", "execution_count": 47, + "id": "d78fc0a4", "metadata": {}, "outputs": [ { @@ -1892,7 +2050,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.8.8" } }, "nbformat": 4, From 88cdfeb6e7ca8cc7057be448b06370fc90b70303 Mon Sep 17 00:00:00 2001 From: MilenaTrajanoska Date: Mon, 17 Jan 2022 16:14:32 +0100 Subject: [PATCH 03/10] Change values in some examples with distributional measures --- examples/Constraints2.ipynb | 38 ++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/examples/Constraints2.ipynb b/examples/Constraints2.ipynb index 5bef3dba9e..c4a905e60d 100644 --- a/examples/Constraints2.ipynb +++ b/examples/Constraints2.ipynb @@ -1270,7 +1270,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 7, "id": "c4cb1421", "metadata": {}, "outputs": [ @@ -1281,13 +1281,13 @@ "Constraint failures by feature - \n", "pet:\n", " test_name total_run failed\n", - " summary entropy BTWN 0.7 and 1.9 1 0\n" + " summary entropy BTWN 0.7 and 2.1 1 0\n" ] } ], "source": [ - "# check if the entropy of the pet_df 'pet' column is between 0.7 and 1.9 (the actual value is 1.85)\n", - "entropy_between_values_constraint = approximateEntropyBetweenConstraint(lower_value=0.7, upper_value=1.9)\n", + "# check if the entropy of the pet_df 'pet' column is between 0.7 and 2.1 (the actual value is around 1.85)\n", + "entropy_between_values_constraint = approximateEntropyBetweenConstraint(lower_value=0.7, upper_value=2.1)\n", "\n", "dc = DatasetConstraints(None, summary_constraints={\"pet\": [entropy_between_values_constraint]})\n", "\n", @@ -1574,7 +1574,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "id": "efa2de2a", "metadata": {}, "outputs": [], @@ -1584,7 +1584,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 40, "id": "cbde8291", "metadata": {}, "outputs": [ @@ -1595,21 +1595,21 @@ "Constraint failures by feature - \n", "pet:\n", " test_name total_run failed\n", - " summary chi_squared_test p-value GT 0.05 1 0\n" + " summary chi_squared_test p-value GT 0.05 1 1\n" ] } ], "source": [ "# create a new distribtution from the pets sample with different probabilities\n", - "pets_reference = np.random.choice(['cat', 'dog', 'rabbit', 'hamster'], size=50, replace=True, p=[0.6, 0.2, 0.1, 0.1])\n", + "pets_reference = np.random.choice(['cat', 'dog', 'rabbit', 'hamster'], size=50, replace=True, p=[0.01, 0.01, 0.97, 0.01])\n", "\n", - "# check if the kl 
divergence is greater than 0.6 \n", + "# check if the p-value is greater than 0.05\n", "chi_squared_p_value_greater_than = columnChiSquaredTestPValueGreaterThanConstraint(reference_distribution=pets_reference, p_value=0.05)\n", "\n", "dc = DatasetConstraints(None, summary_constraints={\"pet\": [chi_squared_p_value_greater_than]})\n", "\n", "profile = session.log_dataframe(pet_df, 'test.data', constraints=dc)\n", - "# now we expect the constraint to not fail\n", + "# now we expect the constraint to fail since the distributions are different\n", "report = profile.apply_summary_constraints()\n", "format_report(report)" ] @@ -1627,12 +1627,12 @@ "id": "0b589597", "metadata": {}, "source": [ - "If you don't have a reference distribution for calculating the Chi-Squared Test, but you know the approximate frequencies of each of the items, you can use this constraint by supplying a mapping of items and frequencies as counts, in the reference distribution parameter of the constraint." + "If you don't have a reference distribution for calculating the Chi-Squared test, but you know the approximate frequencies of each of the items, you can use this constraint by supplying a mapping of items and frequencies as counts, in the reference distribution parameter of the constraint." ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 32, "id": "b4f105f1", "metadata": {}, "outputs": [ @@ -1643,26 +1643,26 @@ "Constraint failures by feature - \n", "pet:\n", " test_name total_run failed\n", - " summary chi_squared_test p-value GT 0.05 1 0\n" + " summary chi_squared_test p-value GT 0.05 1 1\n" ] } ], "source": [ "# create a new distribtution from the pets sample with different probabilities\n", "reference_dict_pets = {\n", - " 'cat': 30,\n", - " 'dog': 10,\n", - " 'rabbit': 5, \n", - " 'hamster': 5,\n", + " 'cat': 1,\n", + " 'dog': 1,\n", + " 'rabbit': 48, \n", + " 'hamster': 1,\n", "}\n", "\n", - "# check if the kl divergence is greater than 0.6 \n", + "# check if the p_value is greater than 0.05\n", "chi_squared_p_value_greater_than = columnChiSquaredTestPValueGreaterThanConstraint(reference_distribution=reference_dict_pets, p_value=0.05)\n", "\n", "dc = DatasetConstraints(None, summary_constraints={\"pet\": [chi_squared_p_value_greater_than]})\n", "\n", "profile = session.log_dataframe(pet_df, 'test.data', constraints=dc)\n", - "# now we expect the constraint to not fail since this is approximately the same distribution from the previous example\n", + "# now we expect the constraint to fail since this is approximately the same distribution from the previous example\n", "report = profile.apply_summary_constraints()\n", "format_report(report)" ] From 21af83369076712bd497d80864ebbceba3ea627e Mon Sep 17 00:00:00 2001 From: MilenaTrajanoska Date: Mon, 24 Jan 2022 17:33:56 +0100 Subject: [PATCH 04/10] Generate constraints possible constraints --- src/whylogs/core/columnprofile.py | 33 ++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/src/whylogs/core/columnprofile.py b/src/whylogs/core/columnprofile.py index c6ffe44c3b..3fc3ef0524 100644 --- a/src/whylogs/core/columnprofile.py +++ b/src/whylogs/core/columnprofile.py @@ -12,6 +12,10 @@ SummaryConstraint, SummaryConstraints, ValueConstraints, + maxLessThanEqualConstraint, + meanBetweenConstraint, + columnValuesTypeEqualsConstraint, + columnUniqueValueCountBetweenConstraint, minGreaterThanEqualConstraint, columnMostCommonValueInSetConstraint, ) from whylogs.core.statistics.hllsketch import HllSketch from 
whylogs.core.types import TypedDataConverter @@ -180,9 +184,32 @@ def generate_constraints(self) -> SummaryConstraints: items = [] if self.number_tracker is not None and self.number_tracker.count > 0: summ = self.number_tracker.to_summary() - if summ.min > 0: - items = [SummaryConstraint(op=Op.GT, first_field="min", value=0)] - # generate additional constraints here + + if summ.min >= 0: + items.append(minGreaterThanEqualConstraint(value=0)) + items.append(meanBetweenConstraint( + lower_value=summ.mean - summ.stddev, + upper_value=summ.mean + summ.stddev, + )) + if summ.max <= 0: + items.append(maxLessThanEqualConstraint(value=0)) + schema_summary = self.schema_tracker.to_summary() + if schema_summary.inferred_type not in (InferredType.UNKNOWN, InferredType.NULL): + items.append(columnValuesTypeEqualsConstraint(expected_type=schema_summary.inferred_type)) + + if self.cardinality_tracker: + unique_count = self.cardinality_tracker.to_summary() + if unique_count.estimate > 0: + items.append(columnUniqueValueCountBetweenConstraint( + lower_value=unique_count.lower, + upper_value=unique_count.upper, + )) + + frequent_items_summary = self.frequent_items.to_summary(max_items=5) + if len(frequent_items_summary) > 0: + most_common_value_set = {val.json_value for val in frequent_items_summary} + items.append(columnMostCommonValueInSetConstraint(value_set=most_common_value_set)) + if len(items) > 0: return SummaryConstraints(items) From eb06acebb3c116cf1f0f5671f93f36e8aeffba12 Mon Sep 17 00:00:00 2001 From: MilenaTrajanoska Date: Tue, 25 Jan 2022 13:24:18 +0100 Subject: [PATCH 05/10] Add tests for generate_constraints, example for generate_constraints in Constraints_Suite.ipynb --- ...traints2.ipynb => Constraints_Suite.ipynb} | 334 +++++++++++++++++- src/whylogs/core/columnprofile.py | 69 ++-- .../unit/core/statistics/test_constraints.py | 156 +++++++- 3 files changed, 515 insertions(+), 44 deletions(-) rename examples/{Constraints2.ipynb => Constraints_Suite.ipynb} (84%) diff --git a/examples/Constraints2.ipynb b/examples/Constraints_Suite.ipynb similarity index 84% rename from examples/Constraints2.ipynb rename to examples/Constraints_Suite.ipynb index c4a905e60d..9d87e1e485 100644 --- a/examples/Constraints2.ipynb +++ b/examples/Constraints_Suite.ipynb @@ -103,7 +103,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "bfd0b5a7", "metadata": {}, "outputs": [], @@ -1991,7 +1991,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 6, "id": "db57da2c", "metadata": {}, "outputs": [], @@ -2001,6 +2001,8 @@ " \"last_name\": [\"Doe\", \"Doe\", \"Smith\", \"Jones\"],\n", " \"username\": [\"jd123\", \"jane.doe@example.com\", \"bobsmith\", \"_anna_\"],\n", " \"email\": [\"john.doe@example.com\", \"jane.doe@example.com\", \"bob.smith@example.com\", \"anna_jones@example.com\"],\n", + " \"followers\": [1525, 12268, 51343, 867],\n", + " \"points\": [23.4, 123.2, 432.22, 32.1],\n", "})" ] }, @@ -2032,6 +2034,334 @@ "# we expect 1 out of 4 evaluations of the constraint to be a failure, sicne Jane Doe's email is the same as their username\n", "format_report(dc.report())" ] + }, + { + "cell_type": "markdown", + "id": "a7da9d24", + "metadata": {}, + "source": [ + "# Generate default constraints for data set" + ] + }, + { + "cell_type": "markdown", + "id": "a88c6ede", + "metadata": {}, + "source": [ + "Let's log the users data frame from the previous example, without any constraints. 
We will use WhyLogs' generate_constraints method to generate default constraints using the dataset profile." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "b938730f", + "metadata": {}, + "outputs": [], + "source": [ + "profile = session.log_dataframe(users, \"test.data\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a552cdce", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"properties\": {\n", + " \"schemaMajorVersion\": 1,\n", + " \"schemaMinorVersion\": 2,\n", + " \"sessionId\": \"8222b610-9472-4bfb-92f5-a56a49cd8199\",\n", + " \"sessionTimestamp\": \"1643116248232\",\n", + " \"dataTimestamp\": \"1643112751681\",\n", + " \"tags\": {\n", + " \"name\": \"test.data\"\n", + " },\n", + " \"metadata\": {}\n", + " },\n", + " \"summaryConstraints\": {\n", + " \"first_name\": {\n", + " \"constraints\": [\n", + " {\n", + " \"name\": \"summary column_values_type EQ STRING\",\n", + " \"firstField\": \"column_values_type\",\n", + " \"value\": 5.0,\n", + " \"op\": \"EQ\",\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary unique_count BTWN 3 and 5\",\n", + " \"firstField\": \"unique_count\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": 3.0,\n", + " \"upperValue\": 5.0\n", + " },\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary most_common_value IN {'Bob', 'Anna', 'John', 'Jane'}\",\n", + " \"firstField\": \"most_common_value\",\n", + " \"op\": \"IN\",\n", + " \"referenceSet\": [\n", + " \"Bob\",\n", + " \"Anna\",\n", + " \"John\",\n", + " \"Jane\"\n", + " ],\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " }\n", + " ]\n", + " },\n", + " \"followers\": {\n", + " \"constraints\": [\n", + " {\n", + " \"name\": \"summary min GE 0/None\",\n", + " \"firstField\": \"min\",\n", + " \"value\": 0.0,\n", + " \"op\": \"GE\",\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary mean BTWN -7308.11238882488 and 40309.612388824884\",\n", + " \"firstField\": \"mean\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": -7308.11238882488,\n", + " \"upperValue\": 40309.612388824884\n", + " },\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary column_values_type EQ INTEGRAL\",\n", + " \"firstField\": \"column_values_type\",\n", + " \"value\": 3.0,\n", + " \"op\": \"EQ\",\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary unique_count BTWN 3 and 5\",\n", + " \"firstField\": \"unique_count\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": 3.0,\n", + " \"upperValue\": 5.0\n", + " },\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary most_common_value IN {'51343', '867', '1525', '12268'}\",\n", + " \"firstField\": \"most_common_value\",\n", + " \"op\": \"IN\",\n", + " \"referenceSet\": [\n", + " \"51343\",\n", + " \"867\",\n", + " \"1525\",\n", + " \"12268\"\n", + " ],\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " }\n", + " ]\n", + " },\n", + " \"last_name\": {\n", + " \"constraints\": [\n", + " {\n", + " \"name\": \"summary column_values_type EQ STRING\",\n", + " \"firstField\": \"column_values_type\",\n", + " \"value\": 5.0,\n", + " \"op\": 
\"EQ\",\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary unique_count BTWN 2 and 4\",\n", + " \"firstField\": \"unique_count\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": 2.0,\n", + " \"upperValue\": 4.0\n", + " },\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary most_common_value IN {'Jones', 'Doe', 'Smith'}\",\n", + " \"firstField\": \"most_common_value\",\n", + " \"op\": \"IN\",\n", + " \"referenceSet\": [\n", + " \"Jones\",\n", + " \"Doe\",\n", + " \"Smith\"\n", + " ],\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " }\n", + " ]\n", + " },\n", + " \"email\": {\n", + " \"constraints\": [\n", + " {\n", + " \"name\": \"summary column_values_type EQ STRING\",\n", + " \"firstField\": \"column_values_type\",\n", + " \"value\": 5.0,\n", + " \"op\": \"EQ\",\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary unique_count BTWN 3 and 5\",\n", + " \"firstField\": \"unique_count\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": 3.0,\n", + " \"upperValue\": 5.0\n", + " },\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary most_common_value IN {'bob.smith@example.com', 'john.doe@example.com', 'jane.doe@example.com', 'anna_jones@example.com'}\",\n", + " \"firstField\": \"most_common_value\",\n", + " \"op\": \"IN\",\n", + " \"referenceSet\": [\n", + " \"bob.smith@example.com\",\n", + " \"john.doe@example.com\",\n", + " \"jane.doe@example.com\",\n", + " \"anna_jones@example.com\"\n", + " ],\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " }\n", + " ]\n", + " },\n", + " \"points\": {\n", + " \"constraints\": [\n", + " {\n", + " \"name\": \"summary min GE 0/None\",\n", + " \"firstField\": \"min\",\n", + " \"value\": 0.0,\n", + " \"op\": \"GE\",\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary mean BTWN -38.98552432358383 and 344.44552432358387\",\n", + " \"firstField\": \"mean\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": -38.98552432358383,\n", + " \"upperValue\": 344.44552432358387\n", + " },\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary column_values_type EQ FRACTIONAL\",\n", + " \"firstField\": \"column_values_type\",\n", + " \"value\": 2.0,\n", + " \"op\": \"EQ\",\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary most_common_value IN {'123.2', '432.22', '32.1', '23.4'}\",\n", + " \"firstField\": \"most_common_value\",\n", + " \"op\": \"IN\",\n", + " \"referenceSet\": [\n", + " \"123.2\",\n", + " \"432.22\",\n", + " \"32.1\",\n", + " \"23.4\"\n", + " ],\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " }\n", + " ]\n", + " },\n", + " \"username\": {\n", + " \"constraints\": [\n", + " {\n", + " \"name\": \"summary column_values_type EQ STRING\",\n", + " \"firstField\": \"column_values_type\",\n", + " \"value\": 5.0,\n", + " \"op\": \"EQ\",\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary unique_count BTWN 3 and 5\",\n", + " \"firstField\": \"unique_count\",\n", + " \"op\": \"BTWN\",\n", + " \"between\": {\n", + " \"lowerValue\": 3.0,\n", + " \"upperValue\": 5.0\n", + " },\n", + " \"verbose\": false,\n", + 
" \"quantileValue\": 0.0\n", + " },\n", + " {\n", + " \"name\": \"summary most_common_value IN {'jd123', 'bobsmith', '_anna_', 'jane.doe@example.com'}\",\n", + " \"firstField\": \"most_common_value\",\n", + " \"op\": \"IN\",\n", + " \"referenceSet\": [\n", + " \"jd123\",\n", + " \"bobsmith\",\n", + " \"_anna_\",\n", + " \"jane.doe@example.com\"\n", + " ],\n", + " \"verbose\": false,\n", + " \"quantileValue\": 0.0\n", + " }\n", + " ]\n", + " }\n", + " },\n", + " \"valueConstraints\": {}\n", + "}\n" + ] + } + ], + "source": [ + "auto_constraints = profile.generate_constraints()\n", + "print(message_to_json(auto_constraints.to_protobuf()))" + ] + }, + { + "cell_type": "markdown", + "id": "77ea23ed", + "metadata": {}, + "source": [ + "For the columns with inferred type STRING, the generate constraints method generates 3 types of constraints: columnValuesTypeEqualsConstraint where the type is STRING, columnUniqueValueCountBetweenConstraint which makes a constraint that the unique values in a column should range between unique_count - 1 and unique_count + 1 in the current data frame, and finally columnMostCommonValueInSetConstraint which takes a set of the 5 most common values and defines a constraint that the most common value in this column should be in that set." + ] + }, + { + "cell_type": "markdown", + "id": "3f683579", + "metadata": {}, + "source": [ + "The columns which have inferred type FRACTIONAL or INTEGRAL, such as 'points' and 'followers' respectively, have numeric constraints generated such as minimum value greater than 0, maximum value less than 0, mean in range [mean - stddev, mean + stddev], if these constraints apply to the current column. Apart from these constraints, columnValuesTypeEqualsConstraint and columnMostCommonValueInSetConstraint are generated for both types. columnUniqueValueCountBetweenConstraint is generated only for the INTEGRAL valued columns." + ] + }, + { + "cell_type": "markdown", + "id": "5cd56524", + "metadata": {}, + "source": [ + "No constraints are generated for columns which have an inferred type of NULL." 
+ ] } ], "metadata": { diff --git a/src/whylogs/core/columnprofile.py b/src/whylogs/core/columnprofile.py index 3fc3ef0524..d4cc08e862 100644 --- a/src/whylogs/core/columnprofile.py +++ b/src/whylogs/core/columnprofile.py @@ -9,23 +9,18 @@ StringTracker, ) from whylogs.core.statistics.constraints import ( - SummaryConstraint, SummaryConstraints, ValueConstraints, + columnMostCommonValueInSetConstraint, + columnUniqueValueCountBetweenConstraint, + columnValuesTypeEqualsConstraint, maxLessThanEqualConstraint, meanBetweenConstraint, - columnValuesTypeEqualsConstraint, - columnUniqueValueCountBetweenConstraint, minGreaterThanEqualConstraint, columnMostCommonValueInSetConstraint, + minGreaterThanEqualConstraint, ) from whylogs.core.statistics.hllsketch import HllSketch from whylogs.core.types import TypedDataConverter -from whylogs.proto import ( - ColumnMessage, - ColumnSummary, - InferredType, - Op, - UniqueCountSummary, -) +from whylogs.proto import ColumnMessage, ColumnSummary, InferredType, UniqueCountSummary from whylogs.util.dsketch import FrequentItemsSketch _TYPES = InferredType.Type @@ -187,31 +182,39 @@ def generate_constraints(self) -> SummaryConstraints: if summ.min >= 0: items.append(minGreaterThanEqualConstraint(value=0)) - items.append(meanBetweenConstraint( - lower_value=summ.mean - summ.stddev, - upper_value=summ.mean + summ.stddev, - )) + items.append( + meanBetweenConstraint( + lower_value=summ.mean - summ.stddev, + upper_value=summ.mean + summ.stddev, + ) + ) if summ.max <= 0: items.append(maxLessThanEqualConstraint(value=0)) - schema_summary = self.schema_tracker.to_summary() - if schema_summary.inferred_type not in (InferredType.UNKNOWN, InferredType.NULL): - items.append(columnValuesTypeEqualsConstraint(expected_type=schema_summary.inferred_type)) - - if self.cardinality_tracker: - unique_count = self.cardinality_tracker.to_summary() - if unique_count.estimate > 0: - items.append(columnUniqueValueCountBetweenConstraint( - lower_value=unique_count.lower, - upper_value=unique_count.upper, - )) - - frequent_items_summary = self.frequent_items.to_summary(max_items=5) - if len(frequent_items_summary) > 0: - most_common_value_set = {val.json_value for val in frequent_items_summary} - items.append(columnMostCommonValueInSetConstraint(value_set=most_common_value_set)) - - if len(items) > 0: - return SummaryConstraints(items) + + schema_summary = self.schema_tracker.to_summary() + inferred_type = schema_summary.inferred_type.type + if inferred_type not in (InferredType.UNKNOWN, InferredType.NULL): + items.append(columnValuesTypeEqualsConstraint(expected_type=inferred_type)) + + if self.cardinality_tracker and inferred_type != InferredType.FRACTIONAL: + unique_count = self.cardinality_tracker.to_summary() + if unique_count and unique_count.estimate > 0: + low = int(max(0, unique_count.lower - 1)) + up = int(unique_count.upper + 1) + items.append( + columnUniqueValueCountBetweenConstraint( + lower_value=low, + upper_value=up, + ) + ) + + frequent_items_summary = self.frequent_items.to_summary(max_items=5) + if frequent_items_summary and len(frequent_items_summary.items) > 0: + most_common_value_set = {val.json_value for val in frequent_items_summary.items} + items.append(columnMostCommonValueInSetConstraint(value_set=most_common_value_set)) + + if len(items) > 0: + return SummaryConstraints(items) return None diff --git a/tests/unit/core/statistics/test_constraints.py b/tests/unit/core/statistics/test_constraints.py index b0952eeaff..25b37aa022 100644 --- 
a/tests/unit/core/statistics/test_constraints.py +++ b/tests/unit/core/statistics/test_constraints.py @@ -56,7 +56,6 @@ def test_value_summary_serialization(): - for each_op, _ in _value_funcs.items(): if each_op == Op.APPLY_FUNC: continue @@ -91,7 +90,6 @@ def test_value_summary_serialization(): def test_value_constraints(df_lending_club, local_config_path): - conforming_loan = ValueConstraint(Op.LT, 548250) smallest_loan = ValueConstraint(Op.GT, 2500.0, verbose=True) @@ -116,7 +114,6 @@ def test_value_constraints(df_lending_club, local_config_path): def test_value_constraints_pattern_match(df_lending_club, local_config_path): - regex_state_abbreviation = r"^[a-zA-Z]{2}$" contains_state = ValueConstraint(Op.MATCH, regex_pattern=regex_state_abbreviation) @@ -290,7 +287,6 @@ def test_value_constraints_raw_and_coerced_types_report(): def test_summary_between_serialization_deserialization(): - # constraints may have an optional name sum_constraint = SummaryConstraint("min", Op.BTWN, 0.1, 2.4) msg_sum_const = sum_constraint.to_protobuf() @@ -365,7 +361,6 @@ def test_summary_between_constraints_fields(df_lending_club, local_config_path): def test_summary_between_constraints_no_merge_different_values_fields(): - std_dev_between1 = SummaryConstraint("stddev", Op.BTWN, value=0.1, upper_value=200) std_dev_between2 = SummaryConstraint("stddev", Op.BTWN, value=0.2, upper_value=200) @@ -521,7 +516,6 @@ def test_max_between_constraint_invalid(): def _apply_summary_constraints_on_dataset(df_lending_club, local_config_path, summary_constraints): - dc = DatasetConstraints(None, summary_constraints=summary_constraints) config = load_config(local_config_path) session = session_from_config(config) @@ -576,7 +570,6 @@ def test_set_summary_constraint_invalid_init(): def test_set_summary_no_merge_different_set(): - set_c_1 = SummaryConstraint("distinct_column_values", Op.CONTAIN_SET, reference_set=[1, 2, 3]) set_c_2 = SummaryConstraint("distinct_column_values", Op.CONTAIN_SET, reference_set=[2, 3, 4, 5]) with pytest.raises(AssertionError): @@ -718,7 +711,6 @@ def _apply_string_length_constraints(local_config_path, length_constraints): def test_string_length_constraints(local_config_path): - length_constraint7 = stringLengthEqualConstraint(length=7) length_constraint24 = stringLengthEqualConstraint(length=24) length_constraint7to10 = stringLengthBetweenConstraint(lower_value=7, upper_value=10) @@ -884,7 +876,6 @@ def _apply_apply_func_constraints(local_config_path, apply_func_constraints): def test_apply_func_value_constraints(local_config_path): - dateutil_parseable = dateUtilParseableConstraint() json_parseable = jsonParseableConstraint() @@ -1870,3 +1861,150 @@ def test_chi_squared_test_p_value_greater_than_constraint_wrong_datatype(): columnChiSquaredTestPValueGreaterThanConstraint({"A": 0.3, "B": 1, "C": 12}, p_value=0.2, verbose=True) with pytest.raises(TypeError): columnChiSquaredTestPValueGreaterThanConstraint(["a", "b", "c"], p_value=1.2, verbose=True) + + +def test_generate_default_constraints_categorical(local_config_path): + usernames = ["jd123", "jane.doe@example.com", "bobsmith", "_anna_"] + emails = ["john.doe@example.com", "jane.doe@example.com", "bob.smith@example.com", "anna_jones@example.com"] + data = pd.DataFrame( + { + "username": usernames, + "email": emails, + } + ) + config = load_config(local_config_path) + session = session_from_config(config) + profile = session.log_dataframe(data, "test.data") + generated_constraints = profile.generate_constraints() + + json_summ = 
json.loads(message_to_json(generated_constraints.to_protobuf())) + constraints_username = json_summ["summaryConstraints"]["username"]["constraints"] + constraints_email = json_summ["summaryConstraints"]["email"]["constraints"] + + # username constraints + assert len(constraints_username) == 3 # column value type equals, unique count between and most common value in set + assert constraints_username[0]["name"] == "summary column_values_type EQ STRING" + assert constraints_username[0]["firstField"] == "column_values_type" + assert constraints_username[0]["value"] == InferredType.STRING + assert constraints_username[0]["op"] == Op.Name(Op.EQ) + assert constraints_username[0]["verbose"] is False + + # there are 4 unique values in the df for username, so the unique count between is in the range 4-1 and 4+1 + assert constraints_username[1]["name"] == "summary unique_count BTWN 3 and 5" + assert constraints_username[1]["firstField"] == "unique_count" + assert constraints_username[1]["op"] == Op.Name(Op.BTWN) + assert pytest.approx(constraints_username[1]["between"]["lowerValue"], 0.001) == 3 + assert pytest.approx(constraints_username[1]["between"]["upperValue"], 0.001) == 5 + assert constraints_username[1]["verbose"] is False + + assert f"summary most_common_value IN" in constraints_username[2]["name"] # set has different order + assert constraints_username[2]["firstField"] == "most_common_value" + assert constraints_username[2]["op"] == Op.Name(Op.IN) + assert set(constraints_username[2]["referenceSet"]) == set(usernames) + assert constraints_username[2]["verbose"] is False + + # email constraints + assert len(constraints_email) == 3 # column value type equals, unique count between and most common value in set + assert constraints_email[0]["name"] == "summary column_values_type EQ STRING" + assert constraints_email[0]["firstField"] == "column_values_type" + assert constraints_email[0]["value"] == InferredType.STRING + assert constraints_email[0]["op"] == Op.Name(Op.EQ) + assert constraints_email[0]["verbose"] is False + + # there are 4 unique values in the df for username, so the unique count between is in the range 4-1 and 4+1 + assert constraints_email[1]["name"] == "summary unique_count BTWN 3 and 5" + assert constraints_email[1]["firstField"] == "unique_count" + assert constraints_email[1]["op"] == Op.Name(Op.BTWN) + assert pytest.approx(constraints_email[1]["between"]["lowerValue"], 0.001) == 3 + assert pytest.approx(constraints_email[1]["between"]["upperValue"], 0.001) == 5 + assert constraints_email[1]["verbose"] is False + + assert f"summary most_common_value IN" in constraints_email[2]["name"] # set has different order + assert constraints_email[2]["firstField"] == "most_common_value" + assert constraints_email[2]["op"] == Op.Name(Op.IN) + assert set(constraints_email[2]["referenceSet"]) == set(emails) + assert constraints_email[2]["verbose"] is False + + +def test_generate_default_constraints_numeric(local_config_path): + data = pd.DataFrame( + { + "followers": [1525, 12268, 51343, 867, 567, 100265, 22113, 3412], + "points": [23.4, 123.2, 432.22, 32.1, 44.1, 42.2, 344.2, 42.1], + } + ) + + config = load_config(local_config_path) + session = session_from_config(config) + profile = session.log_dataframe(data, "test.data") + generated_constraints = profile.generate_constraints() + + json_summ = json.loads(message_to_json(generated_constraints.to_protobuf())) + followers_constraints = json_summ["summaryConstraints"]["followers"]["constraints"] + points_constraints = 
json_summ["summaryConstraints"]["points"]["constraints"] + + assert len(followers_constraints) == 5 + # min greater than 0, mean between mean-stddev and mean+stddev, + # column values type, most common value in set, unique count between + + followers_mean = data["followers"].mean() + followers_stddev = data["followers"].std() + lower_followers = followers_mean - followers_stddev + upper_followers = followers_mean + followers_stddev + + assert followers_constraints[0]["name"] == "summary min GE 0/None" + assert followers_constraints[1]["name"] == f"summary mean BTWN {lower_followers} and {upper_followers}" + assert followers_constraints[2]["name"] == "summary column_values_type EQ INTEGRAL" + assert followers_constraints[3]["name"] == "summary unique_count BTWN 7 and 9" # we have 8 unique values in the df + assert "summary most_common_value IN" in followers_constraints[4]["name"] + + assert len(points_constraints) == 4 + # min greater than 0, mean between mean-stddev and mean+stddev, + # column values type, most common value in set + points_mean = data["points"].mean() + points_stddev = data["points"].std() + lower_points = points_mean - points_stddev + upper_points = points_mean + points_stddev + + assert points_constraints[0]["name"] == "summary min GE 0/None" + assert points_constraints[1]["name"] == f"summary mean BTWN {lower_points} and {upper_points}" + assert points_constraints[2]["name"] == "summary column_values_type EQ FRACTIONAL" + assert "summary most_common_value IN" in points_constraints[3]["name"] + + +def test_generate_default_constraints_mixed(local_config_path): + data = pd.DataFrame( + {"username": ["jd123", "jane.doe@example.com", "bobsmith", "_anna_"], "followers": [1525, 12268, 51343, 867], "null": [None, None, None, None]} + ) + + config = load_config(local_config_path) + session = session_from_config(config) + profile = session.log_dataframe(data, "test.data") + generated_constraints = profile.generate_constraints() + + json_summ = json.loads(message_to_json(generated_constraints.to_protobuf())) + username_constraints = json_summ["summaryConstraints"]["username"]["constraints"] + followers_constraints = json_summ["summaryConstraints"]["followers"]["constraints"] + + # no constraints should be generated for the null column since all values are None + assert "null" not in json_summ["summaryConstraints"] + + assert len(username_constraints) == 3 # column value type equals, unique count between and most common value in set + assert username_constraints[0]["name"] == "summary column_values_type EQ STRING" + assert username_constraints[1]["name"] == "summary unique_count BTWN 3 and 5" # we have 4 unique values in df + assert f"summary most_common_value IN" in username_constraints[2]["name"] + + assert len(followers_constraints) == 5 + # min greater than 0, mean between mean-stddev and mean+stddev, + # column values type, most common value in set, unique count between + + followers_mean = data["followers"].mean() + followers_stddev = data["followers"].std() + lower_followers = followers_mean - followers_stddev + upper_followers = followers_mean + followers_stddev + + assert followers_constraints[0]["name"] == "summary min GE 0/None" + assert followers_constraints[1]["name"] == f"summary mean BTWN {lower_followers} and {upper_followers}" + assert followers_constraints[2]["name"] == "summary column_values_type EQ INTEGRAL" + assert followers_constraints[3]["name"] == "summary unique_count BTWN 3 and 5" # we have 4 unique values in the df + assert "summary most_common_value 
IN" in followers_constraints[4]["name"] From 627addaa46e4563b99e43892bae0d8ece3fafc30 Mon Sep 17 00:00:00 2001 From: MilenaTrajanoska Date: Tue, 25 Jan 2022 13:35:11 +0100 Subject: [PATCH 06/10] Fix failing notebook test --- src/whylogs/core/columnprofile.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/whylogs/core/columnprofile.py b/src/whylogs/core/columnprofile.py index d4cc08e862..6cc23f791f 100644 --- a/src/whylogs/core/columnprofile.py +++ b/src/whylogs/core/columnprofile.py @@ -182,12 +182,18 @@ def generate_constraints(self) -> SummaryConstraints: if summ.min >= 0: items.append(minGreaterThanEqualConstraint(value=0)) - items.append( - meanBetweenConstraint( - lower_value=summ.mean - summ.stddev, - upper_value=summ.mean + summ.stddev, + + mean_lower = summ.mean - summ.stddev + mean_upper = summ.mean + summ.stddev + + if mean_lower != mean_upper: + items.append( + meanBetweenConstraint( + lower_value=mean_lower, + upper_value=mean_upper, + ) ) - ) + if summ.max <= 0: items.append(maxLessThanEqualConstraint(value=0)) From 29b15b2ae699505aa83b2c049f0c7b0898ed8238 Mon Sep 17 00:00:00 2001 From: MilenaTrajanoska Date: Tue, 25 Jan 2022 13:42:48 +0100 Subject: [PATCH 07/10] Change import scipy.special to import scipy --- src/whylogs/core/summaryconverters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/whylogs/core/summaryconverters.py b/src/whylogs/core/summaryconverters.py index 483f4ea339..75e43f0367 100644 --- a/src/whylogs/core/summaryconverters.py +++ b/src/whylogs/core/summaryconverters.py @@ -6,7 +6,7 @@ import datasketches import numpy as np -import scipy.special +import scipy from datasketches import ( frequent_items_error_type, frequent_strings_sketch, From 092ca07dee1cb1f07a6ada31a63f6aabb7ef252a Mon Sep 17 00:00:00 2001 From: MilenaTrajanoska Date: Tue, 25 Jan 2022 14:07:13 +0100 Subject: [PATCH 08/10] Change import scipy to from scipy import special and from scipy import stats --- src/whylogs/core/summaryconverters.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/whylogs/core/summaryconverters.py b/src/whylogs/core/summaryconverters.py index 75e43f0367..bf48dfa6eb 100644 --- a/src/whylogs/core/summaryconverters.py +++ b/src/whylogs/core/summaryconverters.py @@ -6,13 +6,13 @@ import datasketches import numpy as np -import scipy from datasketches import ( frequent_items_error_type, frequent_strings_sketch, kll_floats_sketch, update_theta_sketch, ) +from scipy import special, stats from whylogs.proto import ( ColumnSummary, @@ -234,7 +234,7 @@ def ks_test_compute_p_value(target_distribution: kll_floats_sketch, reference_di if D > D_max: D_max = D n_samples = min(target_distribution.get_n(), reference_distribution.get_n()) - p_value = scipy.special.kolmogorov(np.sqrt(n_samples) * D_max) + p_value = special.kolmogorov(np.sqrt(n_samples) * D_max) return type("Object", (), {"ks_test": p_value}) @@ -323,5 +323,5 @@ def compute_chi_squared_test_p_value(target_distribution: ReferenceDistributionD chi_sq += (i_frequency - ref_frequency) ** 2 / ref_frequency degrees_of_freedom = target_unique_count - 1 - p_value = scipy.stats.chi2.sf(chi_sq, degrees_of_freedom) + p_value = stats.chi2.sf(chi_sq, degrees_of_freedom) return type("Object", (), {"chi_squared_test": p_value}) From 219bc7b78b7a729664b48788b01796aea47b0780 Mon Sep 17 00:00:00 2001 From: pecop2 Date: Wed, 26 Jan 2022 20:08:18 +0100 Subject: [PATCH 09/10] Added new example for table shape constraints and sequentially logging data more 
than once --- examples/Constraints_Suite.ipynb | 265 ++++++++++++++++++++++++++++++- 1 file changed, 261 insertions(+), 4 deletions(-) diff --git a/examples/Constraints_Suite.ipynb b/examples/Constraints_Suite.ipynb index 9d87e1e485..0c28b02ee1 100644 --- a/examples/Constraints_Suite.ipynb +++ b/examples/Constraints_Suite.ipynb @@ -1685,7 +1685,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 66, "id": "cceadc2d", "metadata": {}, "outputs": [ @@ -1700,7 +1700,7 @@ " table total_row_number EQ 14 1 0\n", " table columns CONTAIN this_column_does_not_exist 1 1\n", " table columns CONTAIN col2 1 0\n", - " table columns EQ {'this', 'is', 'set', 'wrong', 'columns', 'a'} 1 1\n", + " table columns EQ {'is', 'a', 'this', 'columns', 'set', 'wrong'} 1 1\n", " table columns EQ {'str1', 'col2'} 1 0\n" ] } @@ -1709,7 +1709,25 @@ "from whylogs.core.statistics.constraints import (\n", " numberOfRowsConstraint, columnExistsConstraint, columnsMatchSetConstraint )\n", "\n", - "# using the above dataframe with the string values, just adding a column\n", + "df = pd.DataFrame(\n", + " [\n", + " {\"str1\": \"random1\"},\n", + " {\"str1\": \"random2\"},\n", + " {\"str1\": \"random 4-1\"},\n", + " {\"str1\": \"4 random\"},\n", + " {\"str1\": \"whylogs rocks!\"},\n", + " {\"str1\": \" \"},\n", + " {\"str1\": 12},\n", + " {\"str1\": {\"name\": \"s\", \"w2w2\": \"dgsg\", \"years\": 232}},\n", + " {\"str1\": json.dumps({\"name\": \"s\", \"w2w2\": \"dgsg\", \"years\": 232, \"abc\": 1})},\n", + " {\"str1\": json.dumps({\"name\": \"s\", \"w2w2\": \"dgsg\", \"years\": \"232\", \"abc\": 1})},\n", + " {\"str1\": \"random str : fail everything\"},\n", + " {\"str1\": \"2003-12-23\"},\n", + " {\"str1\": \"2003-15-23\"},\n", + " {\"str1\": \"10-12-32\"},\n", + " ]\n", + " )\n", + "\n", "df['col2'] = range(len(df))\n", "\n", "rows = numberOfRowsConstraint(n_rows=len(df)+1) # fail\n", @@ -1733,6 +1751,245 @@ "format_report(report)" ] }, + { + "cell_type": "markdown", + "id": "1c44652a", + "metadata": {}, + "source": [ + "### Table shape example 2" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "339bdec3", + "metadata": {}, + "outputs": [], + "source": [ + "logger = session.logger(dataset_name=\"test2.data\", constraints=dc)\n", + "logger.log_dataframe(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "56629731", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Table shape constraint failures -\n", + " test_name total_run failed\n", + " table total_row_number EQ 15 2 2\n", + " table total_row_number EQ 14 2 0\n", + " table columns CONTAIN this_column_does_not_exist 2 2\n", + " table columns CONTAIN col2 2 0\n", + " table columns EQ {'is', 'a', 'this', 'columns', 'set', 'wrong'} 2 2\n", + " table columns EQ {'str1', 'col2'} 2 0\n" + ] + } + ], + "source": [ + "report = logger.profile.apply_table_shape_constraints()\n", + "format_report(report)" + ] + }, + { + "cell_type": "markdown", + "id": "39f56f39", + "metadata": {}, + "source": [ + "Logging another dataframe with different DatasetProfile but the same DatasetConstraints, just an example" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "6f510f16", + "metadata": {}, + "outputs": [], + "source": [ + "logger.log({\"this_column_does_not_exist\": 1}) # logging a new non existent column" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "63280b15", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "\n", + "Table shape constraint failures -\n", + " test_name total_run failed\n", + " table total_row_number EQ 15 3 3\n", + " table total_row_number EQ 14 3 0\n", + " table columns CONTAIN this_column_does_not_exist 3 2\n", + " table columns CONTAIN col2 3 0\n", + " table columns EQ {'is', 'a', 'this', 'columns', 'set', 'wrong'} 3 3\n", + " table columns EQ {'str1', 'col2'} 3 1\n" + ] + } + ], + "source": [ + "report2 = logger.profile.apply_table_shape_constraints()\n", + "format_report(report2)" + ] + }, + { + "cell_type": "markdown", + "id": "78904486", + "metadata": {}, + "source": [ + "After logging the column 'this_column_does_not_exist', the total row number stays the same, \n", + "so the numberOfRowsConstraint passed.\n", + "\n", + "**'table columns CONTAIN this_column_does_not_exist'** constraint now passed, since the column now exists, but\n", + "\n", + "**'table columns EQ {'str1', 'col2'}'** now failed, because new column was logged\n" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "e2c41e44", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Table shape constraint failures -\n", + " test_name total_run failed\n", + " table columns EQ {'str1', 'col2', 'this_column_does_not_exist'} 1 0\n" + ] + } + ], + "source": [ + "set2 = set(columns_set)\n", + "set2.add(\"this_column_does_not_exist\")\n", + "\n", + "columns_match3 = columnsMatchSetConstraint(set2) # new constraint containing the new column\n", + "\n", + "report3 = logger.profile.apply_table_shape_constraints(SummaryConstraints([columns_match3])) # applying just the new constraint\n", + "format_report(report3)" + ] + }, + { + "cell_type": "markdown", + "id": "e45f346c", + "metadata": {}, + "source": [ + "After adding the new column to **'set2'** and creating a **columnsMatchSetConstraint** with it, now it doesn't fail" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "688f2478", + "metadata": {}, + "outputs": [], + "source": [ + "log_dict = dict()\n", + " # logging a new value for every column (one more row)\n", + "for column in df.columns:\n", + " value = df[column][10] # sample from the column\n", + " log_dict[column] = value\n", + "logger.log(log_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "668c93c6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Table shape constraint failures -\n", + " test_name total_run failed\n", + " table total_row_number EQ 15 4 3\n", + " table total_row_number EQ 14 4 1\n", + " table columns CONTAIN this_column_does_not_exist 4 2\n", + " table columns CONTAIN col2 4 0\n", + " table columns EQ {'is', 'a', 'this', 'columns', 'set', 'wrong'} 4 4\n", + " table columns EQ {'str1', 'col2'} 4 2\n" + ] + } + ], + "source": [ + "report4 = logger.profile.apply_table_shape_constraints()\n", + "format_report(report4)" + ] + }, + { + "cell_type": "markdown", + "id": "f3f3f878", + "metadata": {}, + "source": [ + "**'table total_row_number EQ 14'** now failed since new row was logged" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "a0ba26b5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Table shape constraint failures -\n", + " test_name total_run failed\n", + " table total_row_number EQ 15 1 0\n" + ] + } + ], + "source": [ + "rows_3 = numberOfRowsConstraint(n_rows=len(df.index) + 1) # new 
+    "report5 = logger.profile.apply_table_shape_constraints(SummaryConstraints([rows_3]))\n",
+    "format_report(report5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b48a12eb",
+   "metadata": {},
+   "source": [
+    "After creating a new **numberOfRowsConstraint** with n_rows = previous_n_rows + 1 and applying it, the constraint passes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 75,
+   "id": "e6058796",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "15\n"
+     ]
+    }
+   ],
+   "source": [
+    "profile = logger.close() # closing the logger and getting the DatasetProfile\n",
+    "print(profile.total_row_number)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "1956cf2b",
@@ -2380,7 +2637,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.8"
+   "version": "3.9.7"
   }
  },
  "nbformat": 4,

From da4bf16e28c31dc33e551a4f862817e7f8cbf992 Mon Sep 17 00:00:00 2001
From: pecop2
Date: Wed, 26 Jan 2022 22:36:29 +0100
Subject: [PATCH 10/10] Edit default total_row_number in DatasetProfile. Change
 the way of getting the row values when logging a dataframe.

---
 src/whylogs/core/columnprofile.py               |  1 -
 src/whylogs/core/datasetprofile.py              | 16 +++++++++-------
 tests/unit/core/statistics/test_constraints.py |  2 ++
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/whylogs/core/columnprofile.py b/src/whylogs/core/columnprofile.py
index 026842d18f..df8e1d6dd5 100644
--- a/src/whylogs/core/columnprofile.py
+++ b/src/whylogs/core/columnprofile.py
@@ -10,7 +10,6 @@
 )
 from whylogs.core.statistics.constraints import (
     MultiColumnValueConstraints,
-    SummaryConstraint,
     SummaryConstraints,
     ValueConstraints,
     columnMostCommonValueInSetConstraint,
diff --git a/src/whylogs/core/datasetprofile.py b/src/whylogs/core/datasetprofile.py
index e2aa0a62e1..a9827f064d 100644
--- a/src/whylogs/core/datasetprofile.py
+++ b/src/whylogs/core/datasetprofile.py
@@ -167,7 +167,8 @@ def session_timestamp_ms(self):
 
     @property
     def total_row_number(self):
-        return max(self.column_row_dict.values())
+        dict_counts = self.column_row_dict.values() if len(self.column_row_dict) else [0]
+        return max(dict_counts)
 
     def add_output_field(self, field: Union[str, List[str]]):
         if self.model_profile is None:
@@ -311,17 +312,18 @@ def track_dataframe(self, df: pd.DataFrame, character_list=None, token_method=No
         large_df = element_count > 200000
         if large_df:
             logger.warning(f"About to log a dataframe with {element_count} elements, logging might take some time to complete.")
+
         count = 0
-        columns_len = len(df.columns)
         num_records = len(df)
 
         for idx in range(num_records):
-            row_values = df.iloc[idx].values
+            row_values = []
             count += 1
-            for col_idx in range(columns_len):
-                col = df.columns[col_idx]
-                col_str = str(col)
-                self.track(col_str, row_values[col_idx], character_list=None, token_method=None)
+            for col in df.columns:
+                col_values = df[col].values
+                value = col_values[idx]
+                row_values.append(value)
+                self.track(col, value, character_list=None, token_method=None)
 
             if large_df and (count % 200000 == 0):
                 logger.warning(f"Logged {count} elements out of {element_count}")
diff --git a/tests/unit/core/statistics/test_constraints.py b/tests/unit/core/statistics/test_constraints.py
index 5fe13cae82..59d9e5c0a3 100644
--- a/tests/unit/core/statistics/test_constraints.py
+++ b/tests/unit/core/statistics/test_constraints.py
@@ -2395,6 +2395,8 @@ def test_generate_default_constraints_mixed(local_config_path):
     assert followers_constraints[2]["name"] == "summary column_values_type EQ INTEGRAL"
"summary column_values_type EQ INTEGRAL" assert followers_constraints[3]["name"] == "summary unique_count BTWN 3 and 5" # we have 4 unique values in the df assert "summary most_common_value IN" in followers_constraints[4]["name"] + + def _apply_value_constraints_on_dataset(df_lending_club, local_config_path, value_constraints=None, multi_column_value_constraints=None): dc = DatasetConstraints(None, value_constraints=value_constraints, multi_column_value_constraints=multi_column_value_constraints) config = load_config(local_config_path)