# Process Orders Data
1. Ingest the data into the data lakehouse - bronze_orders
2. Perform data quality checks and transform the data as required - silver_orders_clean
3. Apply changes to the Orders data (SCD Type 2) - silver_orders

Note: Within a DLT pipeline, some notebooks can be written in Python and others in SQL. Within a single notebook, however, you cannot mix languages: it must be either all Python or all SQL. Magic commands are not supported.

1. Ingest the data into the data lakehouse - bronze_orders

In [0]:
import dlt
import pyspark.sql.functions as F

@dlt.table(
    name="bronze_orders",
    table_properties={"quality": "bronze"},
    comment="Raw orders data ingested from the source system",
)
def create_bronze_orders():
    """Incrementally ingest raw order JSON files into the bronze layer.

    Reads the orders landing folder as a stream with Auto Loader
    (``cloudFiles``), letting it infer column types from the JSON, and
    appends two audit columns to every record.

    Returns:
        A streaming DataFrame with all source columns plus
        ``input_file_path`` (the file each row was read from) and
        ``ingestion_date`` (the timestamp at which the row was processed).
    """
    raw_orders = (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "json")
        .option("cloudFiles.inferColumnTypes", "true")
        .load("/mnt/circuitbox/landing/operational_data/orders")
    )

    # Keep every source column and add lineage/audit columns.
    return raw_orders.select(
        "*",
        F.col("_metadata.file_path").alias("input_file_path"),
        F.current_timestamp().alias("ingestion_date"),
    )

2. Perform data quality checks and transform the data as required - silver_orders_clean

In [0]:
@dlt.table(
    name="silver_orders_clean",
    comment="Cleaned orders data",
    table_properties={"quality": "silver"},
)
# Hard requirements: a record violating any of these fails the update.
@dlt.expect_all_or_fail(
    {
        "valid_customer_id": "customer_id IS NOT NULL",
        "valid_order_id": "order_id IS NOT NULL",
    }
)
# Soft requirement: violations are recorded in the pipeline's data quality
# metrics, but the offending rows are still written to the table.
@dlt.expect_all(
    {
        "valid_order_status": (
            "order_status IN ('Pending', 'Shipped', 'Cancelled', 'Completed')"
        ),
    }
)
def create_silver_orders_clean():
    """Validate and reshape bronze orders into one row per distinct order item.

    Streams from ``LIVE.bronze_orders``, casts ``order_timestamp`` to a
    timestamp, de-duplicates the ``items`` array of each order and explodes
    it so that every distinct item becomes its own row in column ``item``.

    Returns:
        A streaming DataFrame with the columns ``order_id``, ``customer_id``,
        ``order_timestamp``, ``payment_method``, ``item`` and
        ``order_status``.
    """
    return spark.readStream.table("LIVE.bronze_orders").select(
        "order_id",
        "customer_id",
        # Alias explicitly so the column name is stable for downstream
        # consumers that reference "order_timestamp".
        F.col("order_timestamp").cast("timestamp").alias("order_timestamp"),
        "payment_method",
        F.explode(F.array_distinct(F.col("items"))).alias("item"),
        "order_status",
    )

3. Apply changes to the Orders data (SCD Type 2) - silver_orders

In [0]:
# Declare the target streaming table that will hold the full change history
# (SCD Type 2) of the orders data.
dlt.create_streaming_table(
    name="silver_orders",
    comment="SCD Type 2 orders data",
    table_properties={"quality": "silver"},
)

# Merge the cleaned orders into the target. Rows are matched on the key
# columns, ordered by order_timestamp, and stored as SCD Type 2 so earlier
# versions of a record are retained instead of overwritten.
# NOTE(review): silver_orders_clean explodes `items`, so one order can yield
# several rows sharing the same keys and order_timestamp - confirm that
# these keys identify a record uniquely enough for the merge.
dlt.apply_changes(
    target="silver_orders",
    source="silver_orders_clean",
    keys=["order_id", "customer_id"],
    sequence_by="order_timestamp",
    stored_as_scd_type=2,
)