In [0]:
%pip install databricks-labs-dqx==0.11.1

In [0]:
%restart_python

In [0]:
import yaml
from databricks.labs.dqx import check_funcs
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient
from databricks.labs.dqx.rule import DQRowRule

In [0]:
# Build a small in-memory customer table to run the quality checks against.
# The second row deliberately has no customer_id (None), so the not-null
# rule defined further down has a record to reject.
columns = ["customer_id", "first_name", "country"]
data = [
    (1, "Alice", "USA"),
    (None, "Charlie", "Mexico"),
    (3, "Bob", "Canada"),
]

input_df = spark.createDataFrame(data, schema=columns)

print("Original DataFrame:")
display(input_df)


In [0]:
# Rule definitions in YAML form. This string is parsed with yaml.safe_load in
# a later cell and the resulting list is handed to the DQX engine.
checks_yaml_str = """
# check whether customer_id contains NULL values

- criticality: error
  check:
    function: is_not_null
    arguments:
      column: customer_id
"""

In [0]:
# The engine is created once here and reused by the later class-based cell.
dq_engine = DQEngine(WorkspaceClient())

# Turn the YAML text into plain Python metadata (the parsed rule list).
checks = yaml.safe_load(checks_yaml_str)

# Metadata-style checks are applied with apply_checks_by_metadata_and_split;
# its result is unpacked into the valid rows and the quarantined rows.
valid_df, quarantine_df = dq_engine.apply_checks_by_metadata_and_split(
    input_df, checks
)

# Show each result set under its own heading, valid rows first.
for heading, frame in (
    ("\nValid Rows (passed all checks):", valid_df),
    ("\nQuarantined Rows (failed one or more checks):", quarantine_df),
):
    print(heading)
    display(frame)

In [0]:
# Define the same data quality rule with DQX classes instead of YAML metadata:
# a single row-level rule, with "error" criticality, requiring customer_id to be not null.
# (Only customer_id is checked here, mirroring the YAML example above.)

checks = [
    DQRowRule(
        criticality="error",
        check_func=check_funcs.is_not_null,
        column="customer_id"
    )
]

# Note we use dq_engine.apply_checks_and_split here because the checks are
# DQRowRule objects; the *_by_metadata_* variant is the one used above for
# checks loaded from YAML.
valid_df, quarantine_df = dq_engine.apply_checks_and_split(input_df, checks)

print("\nValid Rows (passed all checks):")
display(valid_df)

print("\nQuarantined Rows (failed one or more checks):")
display(quarantine_df)