In [0]:
# Cell 1
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *


In [0]:
# Cell 2
@dlt.table(
    name="demo_cust_bronze_sd",
    comment="Bronze: raw JSON ingestion (cloudFiles) with schemaEvolutionMode=rescue"
)
def bronze():
    """Stream raw JSON files into the bronze table through the cloudFiles source.

    The reader is configured with column-type inference enabled and
    ``schemaEvolutionMode`` set to ``rescue``. Two columns are appended to each
    record, in this order:

    * ``ingestion_datetime`` - value of ``current_timestamp()``
    * ``source_filename``    - value of ``_metadata.file_path``

    Returns:
        The streaming DataFrame loaded from the monitored folder.
    """
    # Point this at the folder that should be monitored.
    landing_dir = "/Volumes/workspace/damg7370/datastore/json file"

    reader_options = {
        "cloudFiles.format": "json",
        "cloudFiles.inferColumnTypes": "true",
        "cloudFiles.schemaEvolutionMode": "rescue",
    }

    reader = spark.readStream.format("cloudFiles")
    for option_name, option_value in reader_options.items():
        reader = reader.option(option_name, option_value)

    raw_stream = reader.load(landing_dir)
    stamped = raw_stream.withColumn("ingestion_datetime", current_timestamp())
    return stamped.withColumn("source_filename", col("_metadata.file_path"))


In [0]:
# Cell 3
# safe cast helper: treats empty string as null, does to_date conversion for DateType
def safe_cast_col(col_expr, target_type):
    """Convert a column expression to ``target_type``, mapping blank strings to null.

    Args:
        col_expr: Column expression to convert.
        target_type: Spark data type instance. For ``DateType`` the trimmed
            input is parsed with ``to_date``; for every other type the
            untrimmed input is converted with ``Column.cast``.

    Returns:
        A Column that is null when the trimmed input equals ``""`` and the
        converted value otherwise.
    """
    trimmed = trim(col_expr)
    converted = (
        to_date(trimmed)
        if isinstance(target_type, DateType)
        else col_expr.cast(target_type)
    )
    return when(trimmed == "", None).otherwise(converted)

# apply datatype changes based on provided StructType
def process_rescued_datatype_changes(df, target_schema: StructType):
    """Coerce the columns named in ``target_schema`` to their declared types.

    For every field, the value stored under the same key in ``_rescued_data``
    (parsed as a string-to-string map) takes precedence over the value already
    held by the column; the merged value is then passed through
    ``safe_cast_col``. A target column that is not present in ``df`` is built
    from the rescued value alone, so a missing column no longer causes an
    unresolved-column error.

    Args:
        df: Input DataFrame; may or may not carry a ``_rescued_data`` column.
        target_schema: StructType whose fields supply the column names and the
            types they should be cast to.

    Returns:
        The DataFrame with the target columns cast, the temporary
        ``_rescued_map`` column removed, and ``_rescued_data`` (when present)
        overwritten with null.
    """
    rescued_map_type = MapType(StringType(), StringType())

    # Expose the rescued payload as a map; use a typed null map when the
    # column is absent so the lookups below still resolve.
    if "_rescued_data" in df.columns:
        df = df.withColumn("_rescued_map", from_json(col("_rescued_data"), rescued_map_type))
    else:
        df = df.withColumn("_rescued_map", lit(None).cast(rescued_map_type))

    for field in target_schema.fields:
        column_name = field.name
        rescued_val = col("_rescued_map").getItem(column_name)

        if column_name in df.columns:
            # Prefer the rescued value when one exists, else keep the current value.
            merged_raw = when(rescued_val.isNotNull(), rescued_val).otherwise(col(column_name))
        else:
            # Referencing a column that does not exist would fail analysis,
            # so the rescued value is the only available source here.
            merged_raw = rescued_val

        df = df.withColumn(column_name, safe_cast_col(merged_raw, field.dataType))

    df = df.drop("_rescued_map")

    # The rescued payload has been consumed; blank it out.
    if "_rescued_data" in df.columns:
        df = df.withColumn("_rescued_data", lit(None).cast(StringType()))
    return df


In [0]:
# Cell 4
# Target types for columns that must be coerced (here: signupDate -> nullable DateType).
updated_datatypes = StructType([
    StructField(name="signupDate", dataType=DateType(), nullable=True),
])

@dlt.table(
    name="demo_cust_silver_sd",
    comment="Silver: cleaned records with new fields promoted and signupDate cast to date"
)
def silver():
    """Build the silver table from ``demo_cust_bronze_sd``.

    Steps:
      1. Collect the distinct keys found in ``_rescued_data`` (parsed as a
         string-to-string map), ignoring ``_file_path``.
      2. Add each key that is not already a column as a new string column
         taken from the rescued map. Keys are processed in sorted order so the
         resulting column order does not depend on the order in which
         ``distinct()`` happens to return rows.
      3. Apply ``process_rescued_datatype_changes`` with ``updated_datatypes``.
      4. Overwrite ``_rescued_data`` with null.

    Returns:
        The transformed DataFrame.
    """
    bronze_df = dlt.read("demo_cust_bronze_sd")
    rescued_map_type = MapType(StringType(), StringType())

    discovered_keys = []
    if "_rescued_data" in bronze_df.columns:
        rescued_map_col = from_json(col("_rescued_data"), rescued_map_type)
        keys_df = (
            bronze_df.select(explode(map_keys(rescued_map_col)).alias("rescued_key"))
                     .distinct()
                     .filter(col("rescued_key").isNotNull())
        )

        # NOTE(review): collect() triggers a Spark job while this dataset
        # function is being evaluated. Confirm against the DLT guidance that
        # running actions inside a dataset definition is acceptable here.
        # sorted(): distinct() gives no ordering guarantee, and the order of
        # these keys determines the order of the columns added below.
        discovered_keys = sorted(
            row["rescued_key"]
            for row in keys_df.collect()
            if row["rescued_key"] != "_file_path"
        )

    df = bronze_df

    # Promote newly discovered fields to string columns.
    if discovered_keys:
        # Parse the rescued payload once and read every new column from it.
        df = df.withColumn("_rescued_map", from_json(col("_rescued_data"), rescued_map_type))
        for key in discovered_keys:
            if key not in df.columns:
                df = df.withColumn(key, col("_rescued_map").getItem(key).cast(StringType()))
        df = df.drop("_rescued_map")

    # Apply datatype changes (e.g., signupDate -> DateType); runs even when
    # no new keys were discovered.
    df = process_rescued_datatype_changes(df, updated_datatypes)

    # Final cleanup: make sure _rescued_data is cleared.
    if "_rescued_data" in df.columns:
        df = df.withColumn("_rescued_data", lit(None).cast(StringType()))

    return df


In [0]:
# Cell 5 (optional)
# Example DLT expectations: CustomerID should be present
# An expectation decorator on its own does not register a dataset; it has to be
# stacked under @dlt.table (or @dlt.view) for the rule to be enforced. With no
# explicit name, the dataset takes the function name.
@dlt.table(
    comment="Silver records with rows lacking a CustomerID dropped"
)
@dlt.expect_or_drop("valid_customerid", "CustomerID IS NOT NULL")
def silver_with_expectations():
    """Return ``demo_cust_silver_sd`` with the ``valid_customerid`` expectation attached.

    The expectation is declared with ``expect_or_drop`` on the condition
    ``CustomerID IS NOT NULL``.

    Returns:
        The DataFrame read from ``demo_cust_silver_sd``.
    """
    return dlt.read("demo_cust_silver_sd")
