In [0]:
%pip install databricks-labs-dqx==0.11.1

In [0]:
%restart_python

In [0]:
import yaml
from databricks.labs.dqx import check_funcs
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient
from databricks.labs.dqx.rule import DQRowRule

In [0]:
# Demo dataset: four records, two of which have an unusable email address
# (row 2 holds None, row 4 holds an empty string). Timestamps are supplied as
# plain strings.
columns = ["id", "email_address", "created_at"]
data = [
    (1, "anuja@example.com", "2025-11-10 12:12:00"),
    (2, None, "2025-11-11 16:28:00"),  # missing email
    (3, "leo@example.com", "2025-11-12 16:30:00"),
    (4, "", "2025-11-12 16:35:00"),  # empty email
]

# Column names are passed as the schema; no explicit column types are given.
input_df = spark.createDataFrame(data, schema=columns)

print("Original DataFrame:")
display(input_df)

In [0]:
# Data quality checks declared as YAML text. The document is a one-item list;
# the item names the check function to run (`is_not_null_and_not_empty`), the
# column it targets (`email_address`), and the criticality assigned to
# failures (`error`).
# Note: the line starting with '#' inside the string is a YAML comment that
# is part of the string's content, not a Python comment.
checks_yaml_str = """
# check whether email_address contains null values or empty values (e.g., "")

- criticality: error
  check:
    function: is_not_null_and_not_empty
    arguments:
      column: email_address

"""

In [0]:
# Parse the YAML text into plain Python objects (here: a list with one check mapping)
checks = yaml.safe_load(checks_yaml_str)

# The engine is built around a workspace client created with default settings
workspace_client = WorkspaceClient()
dq_engine = DQEngine(workspace_client)

# Run the metadata-defined checks; the engine hands back two DataFrames:
# the rows considered valid and the rows to quarantine
valid_df, quarantine_df = dq_engine.apply_checks_by_metadata_and_split(
    input_df,
    checks,
)

print("\nValid Rows (passed all checks):")
display(valid_df)

print("\nQuarantined Rows (failed one or more checks):")
display(quarantine_df)

In [0]:
# Same email check as a DQRowRule object built in Python:
# email_address must be neither null nor empty, and failures are
# reported with "error" criticality.
email_not_blank_rule = DQRowRule(
    criticality="error",
    check_func=check_funcs.is_not_null_and_not_empty,
    column="email_address",
)
checks = [email_not_blank_rule]

# Run the rule objects; the engine hands back the valid rows and the
# quarantined rows as two separate DataFrames
valid_df, quarantine_df = dq_engine.apply_checks_and_split(
    input_df,
    checks,
)

print("\nValid Rows (passed all checks):")
display(valid_df)

print("\nQuarantined Rows (failed one or more checks):")
display(quarantine_df)