In [0]:
%pip install databricks-labs-dqx

Collecting databricks-labs-dqx
  Downloading databricks_labs_dqx-0.7.0-py3-none-any.whl.metadata (3.4 kB)
Collecting databricks-labs-blueprint<0.10,>=0.9.1 (from databricks-labs-dqx)
  Downloading databricks_labs_blueprint-0.9.3-py3-none-any.whl.metadata (55 kB)
Collecting databricks-labs-lsql<0.15,>=0.5 (from databricks-labs-dqx)
  Downloading databricks_labs_lsql-0.14.2-py3-none-any.whl.metadata (8.7 kB)
Collecting databricks-sdk~=0.57 (from databricks-labs-dqx)
  Downloading databricks_sdk-0.58.0-py3-none-any.whl.metadata (39 kB)
Collecting sqlglot>=22.3.1 (from databricks-labs-lsql<0.15,>=0.5->databricks-labs-dqx)
  Downloading sqlglot-27.0.0-py3-none-any.whl.metadata (20 kB)
Downloading databricks_labs_dqx-0.7.0-py3-none-any.whl (85 kB)
Downloading databricks_labs_blueprint-0.9.3-py3-none-any.whl (61 kB)
Downloading databricks_labs_lsql-0.14.2-py3-none-any.whl (47 kB)
Downloading databricks_sdk-0.58.0-py3-none-any.whl (741 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [0]:
from databricks.labs.dqx import check_funcs
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient
from databricks.labs.dqx.rule import DQForEachColRule

def _show_frame(heading, frame):
    """Print a heading line, then render the DataFrame with the notebook's display()."""
    print(heading)
    display(frame)


# --- Sample input -----------------------------------------------------------
# Four rows; the last one is missing both Name and Country, and the third is
# missing only ID (which no rule below inspects).
columns = ["Name", "ID", "Country"]
data = [
    ("Alice", 1, "USA"),
    ("Bob", 2, "Canada"),
    ("Charlie", None, "Mexico"),
    (None, 4, None),
]
input_df = spark.createDataFrame(data, schema=columns)

_show_frame("Original DataFrame:", input_df)

# --- Rules ------------------------------------------------------------------
# One is_not_null rule per listed column, each at "error" criticality.
checks = list(
    DQForEachColRule(
        columns=["Country", "Name"],
        check_func=check_funcs.is_not_null,
        criticality="error",
    ).get_rules()
)

# --- Engine -----------------------------------------------------------------
# The engine is constructed from a Databricks WorkspaceClient.
dq_engine = DQEngine(WorkspaceClient())

# --- Apply and split --------------------------------------------------------
# Returns two DataFrames: rows that passed, and rows that failed a check.
valid_df, quarantine_df = dq_engine.apply_checks_and_split(input_df, checks)

_show_frame("\nValid Rows (passed all checks):", valid_df)
_show_frame("\nQuarantined Rows (failed one or more checks):", quarantine_df)

Original DataFrame:


Name,ID,Country
Alice,1.0,USA
Bob,2.0,Canada
Charlie,,Mexico
,4.0,



Valid Rows (passed all checks):


Name,ID,Country
Alice,1.0,USA
Bob,2.0,Canada
Charlie,,Mexico



Quarantined Rows (failed one or more checks):


Name,ID,Country,_errors,_warnings
,4,,"List(List(country_is_null, Column 'Country' value is null, List(Country), null, is_not_null, 2025-07-11T17:36:35.557Z, Map()), List(name_is_null, Column 'Name' value is null, List(Name), null, is_not_null, 2025-07-11T17:36:35.557Z, Map()))",
