In [0]:
%pip install databricks-labs-dqx==0.9.3

In [0]:
# Restart the Python process so the package installed by the %pip cell above
# is importable in the cells that follow.
dbutils.library.restartPython()

In [0]:
# Load the sample dataset (Parquet files) from the Databricks volume.
df_sample = (
    spark.read
    .format("parquet")
    .load("/Volumes/securehome/raw/phoenix/phoenix_10pct_data/")
)

In [0]:
# Import the YAML parser, the Databricks WorkspaceClient, and the DQX engine and rule generator
import yaml
from databricks.sdk import WorkspaceClient
from databricks.labs.dqx.engine import DQEngine
from databricks.labs.dqx.profiler.generator import DQGenerator


# Create a single WorkspaceClient, which gives DQX access to the Databricks context,
# and hand that same client to the engine instead of constructing a second one.
ws = WorkspaceClient()
dq_engine = DQEngine(ws)

In [0]:
# Data-quality rules in DQX's YAML metadata format: one is_not_null rule per required
# column, plus a regex_match rule (negate: true) on email_address using the whitespace
# pattern. The string literal is not raw, so "\\s" below reaches the YAML parser as \s.
# The engine is not re-created here: dq_engine from the setup cell above is reused.
checks_str = """
  - check:
      arguments:
        column: user_id
      function: is_not_null
    criticality: error
    name: user_id_is_null
  - check:
      arguments:
        column: firstname
      function: is_not_null
    criticality: error
    name: firstname_is_null
  - check:
      arguments:
        column: lastname
      function: is_not_null
    criticality: error
    name: lastname_is_null
  - check:
      arguments:
        column: email_address
      function: is_not_null
    criticality: error
    name: email_address_is_null
  - check:
      arguments:
        column: phone_number
      function: is_not_null
    criticality: error
    name: phone_number_is_null
  - check:
      arguments:
        column: ingestion_timestamp
      function: is_not_null
    criticality: error
    name: ingestion_timestamp_is_null
  - check:
      arguments:
        column: source_system
      function: is_not_null
    criticality: error
    name: source_system_is_null
  - check:
      arguments:
        column: email_address
        regex: \\s
        negate: true
      function: regex_match
    criticality: error
    name: email_address_is_invalid
"""

# Parse the YAML; the top level is a sequence, so this yields a list of dictionaries
# (one per check), not a single dictionary.
checks = yaml.safe_load(checks_str)

# Split the good data (ones without issues) from the bad data (ones with issues)
good_df, bad_df = dq_engine.apply_checks_by_metadata_and_split(df_sample, checks)

In [0]:
# Show the email address of each flagged row next to its _errors / _warnings columns.
bad_df.select(
    "email_address",
    "_errors",
    "_warnings",
).display()

In [0]:
%sql
-- Make sure the securehome catalog and its temp schema exist.
-- IF NOT EXISTS makes both statements safe to re-run.
CREATE CATALOG IF NOT EXISTS securehome;
CREATE SCHEMA IF NOT EXISTS securehome.temp;

In [0]:
from pyspark.sql.functions import col, regexp_replace

# --- Step 1: TRANSFORM (The "Cleaner") ---
# Non-destructive fix: email_address is left as-is and the whitespace-free
# value goes into a new column beside it.
email_without_whitespace = regexp_replace(col("email_address"), r"\s+", "")
df_silver = bad_df.withColumn("email_address_no_whitespace", email_without_whitespace)

print("Bronze to Silver 'Cleaner' job complete.")

# Preview up to 10 rows with the original and cleaned address side by side.
(
    df_silver
    .select("user_id", "email_address", "email_address_no_whitespace")
    .limit(10)
    .display()
)

In [0]:
# --- Step 2: Write to Silver ---
# Persist the cleaned rows as a Delta table, replacing any previous contents.
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("securehome.temp.silver_cleaned_email_phoenix")
)


In [0]:
from pyspark.sql import functions as F

# NOTE(review): neither `bad_email_address_df` nor `F` was defined in any earlier cell,
# so this cell raised NameError on a fresh Restart & Run All. The frame is reconstructed
# here as the rows of bad_df whose email contains whitespace (the same \s pattern the
# email_address_is_invalid check uses) — confirm this matches the original intent.
bad_email_address_df = bad_df.filter(F.col("email_address").rlike(r"\s"))

# Strip every run of whitespace from the address into a new column; the original
# email_address column is kept unchanged.
cleaned_email_address_df = bad_email_address_df.withColumn(
    "email_address_no_whitespace",
    F.regexp_replace("email_address", r"(\s+)", ""),
)

cleaned_email_address_df.display()