In [0]:
%pip install databricks-labs-dqx==0.9.3

In [0]:
# Restart the Python process so the library version installed by the %pip cell above
# is the one picked up by the imports in the following cells.
dbutils.library.restartPython()

In [0]:
# Load the sample data from the Databricks volume; this frame is what gets profiled.
sample_path = "/Volumes/securehome/raw/phoenix/phoenix_10pct_data/"
df_sample = spark.read.parquet(sample_path)

In [0]:
# import DQX profiler, and the WorkspaceClient
import pandas as pd
from databricks.labs.dqx.profiler.profiler import DQProfiler
from databricks.sdk import WorkspaceClient
import yaml
# from databricks.labs.dqx.check_funcs import is_not_null_and_not_empty
from databricks.labs.dqx.engine import DQEngine
from databricks.labs.dqx.profiler.generator import DQGenerator


# Create one WorkspaceClient and share it between the profiler and the engine
# instead of constructing a second client for the engine.
ws = WorkspaceClient()
profiler = DQProfiler(ws)
dq_engine = DQEngine(ws)

# Profile the sample data. `quality_profiles` is passed to the rule generator in the
# next cell; `summary_stats` is kept for inspection.
summary_stats, quality_profiles = profiler.profile(df_sample)

In [0]:
# Turn the profiler output into candidate data quality checks.
generator = DQGenerator(ws)
checks = generator.generate_dq_rules(quality_profiles)

# Dump the suggested checks as YAML so they can be reviewed before use.
suggested_checks_yaml = yaml.safe_dump(checks)
print(suggested_checks_yaml)

Now run `is_not_null` checks on the 7 required columns and a `regex_match` check on the `email_address` column.

In [0]:
# Read the dataset that the data quality checks will be applied to.
input_path = "/Volumes/securehome/raw/phoenix/phoenix_10pct_data/"
input_df = spark.read.parquet(input_path)

In [0]:
dq_engine = DQEngine(WorkspaceClient())

# Check definitions in DQX metadata (YAML) form.
# The literal is a raw string so the backslash in the e-mail regex reaches the YAML
# parser untouched; in a normal string literal `\.` is an invalid escape sequence and
# triggers a Python warning. The regex is also single-quoted in YAML, where backslashes
# are taken literally.
checks_str = r"""
  - check:
      arguments:
        column: user_id
      function: is_not_null
    criticality: error
    name: user_id_is_null
  - check:
      arguments:
        column: firstname
      function: is_not_null
    criticality: error
    name: firstname_is_null
  - check:
      arguments:
        column: lastname
      function: is_not_null
    criticality: error
    name: lastname_is_null
  - check:
      arguments:
        column: email_address
      function: is_not_null
    criticality: error
    name: email_address_is_null
  - check:
      arguments:
        column: phone_number
      function: is_not_null
    criticality: error
    name: phone_number_is_null
  - check:
      arguments:
        column: ingestion_timestamp
      function: is_not_null
    criticality: error
    name: ingestion_timestamp_is_null
  - check:
      arguments:
        column: source_system
      function: is_not_null
    criticality: error
    name: source_system_is_null
  # Flag rows whose email_address does NOT match the e-mail pattern.
  # negate must be false here: with negate set to true the check is inverted and
  # reports the rows that DO match the regex, i.e. the valid addresses.
  - check:
      arguments:
        column: email_address
        regex: '^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
        negate: false
      function: regex_match
    criticality: error
    name: email_address_is_invalid
"""

# Parse the YAML into a list of check dictionaries
checks = yaml.safe_load(checks_str)

# Split the good data (rows without issues) from the bad data (rows with issues)
good_df, bad_df = dq_engine.apply_checks_by_metadata_and_split(input_df, checks)

In [0]:
# Show the records that have no data quality issues.
display(good_df)

In [0]:
# Show the records that have data quality issues.
display(bad_df)