#### About DQX

**DQX** is a Python-based data quality framework, developed by Databricks Labs, for validating the quality of PySpark DataFrames.

**Links**:
- Official docs - https://databrickslabs.github.io/dqx/
- Github repo - https://github.com/databrickslabs/dqx
- Article with review - https://www.waitingforcode.com/databricks/data-quality-databricks-dqx/read

**Some features**:
- Can evaluate a DataFrame and split it into `valid_df` and `invalid_df`
- Can profile data and automatically generate candidate data quality rules (which can be reviewed, refined and finalized as needed)
- Checks can be defined as code (Python) or as configuration (YAML)
- Comes with a simple Databricks dashboard for identifying and tracking data quality issues
- Makes it easy to define custom checks, including with SQL-based syntax (e.g. `expression: "ended_at > started_at"`)
- Supports Spark batch and streaming, including DLT (Delta Live Tables)

**Cons**:
- Designed to check data within a single DataFrame, so if you need to check data across several DataFrames, you first need to join them into one.
- Doesn't have a standard referential integrity check out of the box.
- The dashboard looks very primitive and is not very useful.

**Use cases**:
- Quality checks before ingestion into curated layers
- Post-factum checks (on data that is already loaded)

**Alternatives**: dbt, Soda, Great Expectations

In [0]:
# Installing DQX in the notebook (uncomment the two lines below to install the package and restart Python)

#%pip install databricks-labs-dqx

#dbutils.library.restartPython()

In [0]:
from pyspark.sql import DataFrame
from pyspark.sql.types import *
from pyspark.sql.functions import explode, col
from databricks.labs.dqx.profiler.profiler import DQProfiler
from databricks.labs.dqx.profiler.generator import DQGenerator
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient
from databricks.labs.dqx.rule import DQRowRule
from databricks.labs.dqx import check_funcs
import yaml


# Engine used by every cell below to apply DQX checks
dq_engine = DQEngine(WorkspaceClient())

# Sample dataframe that deliberately contains data quality issues.
# Tuple fields, in order: id, name, email, age, signup_date, gender
df = spark.createDataFrame(
    [
        (1, "Alice", "alice@example.com", 30, "2022-01-15", "Female"),
        (2, "Bob", "bob@example.com", 25, "2022-14-01", "Male"),  # month 14 does not exist
        (3, "Charlie", "charlie@example.com", None, "2022-03-01", "Femail"),  # missing age, misspelled gender
        (4, "Joanna", "joice@example.com", 30, "2022-01-15", "Female"),
        (5, "Eve", None, 200, "2022-02-30", "Female"),  # missing email, age 200, February 30th
        (None, "Frank", "frank@example.com", 28, "2022-05-20", "F"),  # missing id, abbreviated gender
    ],
    # All columns are nullable; signup_date is kept as a string on purpose so
    # that malformed dates survive loading and can be caught by the checks
    schema=(
        StructType()
        .add("id", IntegerType(), True)
        .add("name", StringType(), True)
        .add("email", StringType(), True)
        .add("age", IntegerType(), True)
        .add("signup_date", StringType(), True)
        .add("gender", StringType(), True)
    ),
)

display(df)

In [0]:
# Profile the sample dataframe; the resulting profiles feed the rule generator in the next cell
ws = WorkspaceClient()  # also reused by the DQGenerator cell below

# Profile the input data
profiler = DQProfiler(ws)

# Change the default sample fraction from 30% to 100% for demo purpose
# profile() returns a pair: summary statistics and the collection of profiles
summary_stats, profiles = profiler.profile(df, options={"sample_fraction": 1.0})

print("Print summary_stats")
print(yaml.safe_dump(summary_stats))  # dumped as YAML for readability

print("Print profiles")
for profile in profiles:
    print(profile)

In [0]:
# Generate data quality check candidates from the profiles
# (`ws` and `profiles` are defined in the profiling cell above)

generator = DQGenerator(ws)

# Candidates only: review and refine them before using them as real checks
generated_checks = generator.generate_dq_rules(profiles)

print("Print generate data quality check candidates")
print(yaml.safe_dump(generated_checks))  # dumped as YAML for readability

In [0]:
# Option 1: Defining checks in YAML
# Each entry names a check function, its arguments, a criticality and the
# name under which failures of that check are reported.

yaml_checks = yaml.safe_load("""
- check:
    function: is_not_null
    arguments:
      column: id
    criticality: error
    name: id_is_null
- check:
    function: is_in_range
    arguments:
      column: age
      max_limit: 100
      min_limit: 10
    criticality: error
    name: age_isnt_in_range
- check:
    function: is_valid_date
    arguments:
      column: signup_date
    criticality: error
    name: wrong_date_format
- check:
    function: is_in_list
    arguments:
      allowed:
        - Female
        - Male
      column: gender
    criticality: error
    name: gender_is_not_in_the_list
""")

# Execute checks: the metadata (dict-based) API takes the parsed YAML directly
# and returns the dataframe split into a valid part and a quarantined part
valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(df, yaml_checks)
display(quarantined_df)

In [0]:
# Option 2: Defining checks in Python
# Same four rules as the YAML definition, expressed as DQRowRule objects.
# Every rule carries an explicit name so that failures are reported under
# the same labels as in the YAML variant.

python_checks = [
    DQRowRule(
        name="id_is_null",
        criticality="error",
        # NOTE: the YAML variant of this rule uses `is_not_null`
        check_func=check_funcs.is_not_null_and_not_empty,
        column="id",
    ),
    DQRowRule(
        name="age_isnt_in_range",
        criticality="error",
        check_func=check_funcs.is_in_range,
        column="age",
        check_func_kwargs={"min_limit": 10, "max_limit": 100},
    ),
    DQRowRule(
        name="wrong_date_format",
        criticality="error",
        check_func=check_funcs.is_valid_date,
        column="signup_date",
    ),
    DQRowRule(
        name="gender_is_not_in_the_list",
        criticality="error",
        check_func=check_funcs.is_in_list,
        column="gender",
        check_func_kwargs={"allowed": ["Female", "Male"]},
    ),
]

# Execute checks: split the dataframe into a valid part and a quarantined part
valid_df, quarantined_df = dq_engine.apply_checks_and_split(df, python_checks)
display(quarantined_df)

In [0]:
# Alternative to splitting: apply quality rules and flag invalid records via additional result columns (`_warnings` and `_errors`)

#valid_and_quarantined_df = dq_engine.apply_checks_by_metadata(df, yaml_checks)  # for yaml defined checks

valid_and_quarantined_df = dq_engine.apply_checks(df, python_checks)  # for python defined checks

# Methods to get valid and invalid dataframes
display(dq_engine.get_valid(valid_and_quarantined_df))
display(dq_engine.get_invalid(valid_and_quarantined_df))

# The unsplit result, including the check result columns
display(valid_and_quarantined_df)

In [0]:
def _get_validation_summary_log(dqx_df_errors: DataFrame, errors_column: str = "_errors") -> str:
    """Summarize failed DQX checks as one line per check name.

    Args:
        dqx_df_errors: DataFrame whose ``errors_column`` holds an array of
            error structs, each with a ``name`` field.
        errors_column: Name of the column holding the errors. Defaults to
            ``"_errors"``.

    Returns:
        Newline-joined lines of the form
        ``Found <count> records with failed check "<name>"``, sorted by check
        name. An empty string when no errors are present.
    """
    failure_counts = (
        dqx_df_errors
        .select(explode(errors_column).alias("err"))
        .groupby(col("err.name").alias("failed_check"))
        .count()
        # Grouping gives no ordering guarantee; sort so the summary is stable across runs
        .orderBy("failed_check")
        .collect()
    )

    return "\n".join(
        f'Found {row["count"]} records with failed check "{row["failed_check"]}"'
        for row in failure_counts
    )

In [0]:
# Summarize the rows quarantined by the most recently executed split cell above
print(_get_validation_summary_log(quarantined_df))

In [0]:
import logging

def configure_logging(level: int = logging.INFO) -> None:
    """Configure the root logger to write timestamped records to a stream handler.

    Args:
        level: Minimum severity the root logger lets through. Defaults to
            ``logging.INFO``.
    """
    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[logging.StreamHandler()],
        # Replace any handlers already attached to the root logger, so the
        # configuration takes effect even if logging was set up earlier and
        # re-running the cell does not stack duplicate handlers
        force=True,
    )

# Apply the logging configuration for the cells that follow
configure_logging()

# Smoke test: emit one INFO record to confirm the configuration is active
logging.info("test log message")

In [0]:
# Define custom exception for data quality issues
class DataQualityException(Exception):
    """Signals that one or more data quality checks did not pass."""


def validate_df(df: DataFrame) -> None:
    """Validate a dataframe against the data quality checks defined below.

    Args:
        df: DataFrame to validate. The checks read its ``id``, ``age``,
            ``signup_date`` and ``gender`` columns.

    Raises:
        DataQualityException: If at least one check fails. The message lists
            every failed check together with the number of affected records.
    """
    logging.info("Started validating dataframe")

    # Defined data quality checks; each one is named so that the failure
    # summary reports it under a stable label
    dq_checks = [
        DQRowRule(
            name="id_is_null",
            criticality="error",
            check_func=check_funcs.is_not_null_and_not_empty,
            column="id",
        ),
        DQRowRule(
            name="age_isnt_in_range",
            criticality="error",
            check_func=check_funcs.is_in_range,
            column="age",
            check_func_kwargs={"min_limit": 10, "max_limit": 100},
        ),
        DQRowRule(
            name="wrong_date_format",
            criticality="error",
            check_func=check_funcs.is_valid_date,
            column="signup_date",
        ),
        DQRowRule(
            name="gender_is_not_in_the_list",
            criticality="error",
            check_func=check_funcs.is_in_list,
            column="gender",
            check_func_kwargs={"allowed": ["Female", "Male"]},
        ),
    ]

    # Apply data quality checks; only the quarantined part is needed here
    _, df_errors = dq_engine.apply_checks_and_split(df, dq_checks)

    # Build the summary once and use it both as the failure test and as the
    # message, so the checks are evaluated by a single Spark action. Every
    # check above has criticality "error", so each quarantined row adds at
    # least one entry to the summary; an empty summary means nothing failed.
    failure_summary = _get_validation_summary_log(df_errors)
    if failure_summary:
        raise DataQualityException(
            f"Data quality failure:\n{failure_summary}"
        )

    logging.info("Dataframe validated successfully")



# Run the validation on the sample dataframe; a data quality failure is
# reported by printing its message rather than by failing the cell
try:
    validate_df(df)
except DataQualityException as dq_error:
    print(dq_error)
