### Process Orders Data
1. Ingest the data into the data lakehouse - stg_orders
2. Perform data quality checks and transform the data as required - stg_orders_clean
3. Flatten the orders data by exploding the order items - raw_orders

In [None]:
import dlt
from pyspark.sql.functions import col, current_timestamp, current_date, explode

#### 1. Ingest the data into the data lakehouse - stg_orders

In [None]:

@dlt.table(
    name="stg_orders",  # you can assign a schema name here as well: <schema_name>.stg_orders
    comment="The orders data ingested from the order's data lakehouse.",
    table_properties={
        "quality": "staging",
        "delta.autoOptimize.optimizeWrite": "true",
    },
)
def stg_orders():
    """Stream the order JSON files from the landing volume into ``stg_orders``.

    The files are read with the ``cloudFiles`` streaming source, and three
    audit columns are appended to every record:

    * ``input_file_path``  - taken from ``_metadata.file_path``
    * ``ingest_timestamp`` - ``current_timestamp()``
    * ``load_date``        - ``current_date()``

    Returns:
        The streaming DataFrame that backs the ``stg_orders`` table.
    """
    landing_stream = (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "json")
        .option("cloudFiles.inferSchema", "true")
        .option("cloudFiles.inferColumnTypes", "true")
        .option("cloudFiles.schemaLocation", "/Volumes/circuitbox/landing/operational_data/schema/orders/")
        .load("/Volumes/circuitbox/landing/operational_data/orders/")
    )

    # Audit columns: source file plus the time/date at which the row was ingested.
    return (
        landing_stream
        .withColumn("input_file_path", col("_metadata.file_path"))
        .withColumn("ingest_timestamp", current_timestamp())
        .withColumn("load_date", current_date())
    )

#### 2. Perform data quality checks and transform the data as required - stg_orders_clean

In [None]:
@dlt.table(
    name="stg_orders_clean",
    comment="Cleaned orders data",
    table_properties={'quality': 'staging'},
)
@dlt.expect_or_fail("valid_customer_id", "customer_id IS NOT NULL")
@dlt.expect_or_drop("valid_order_id", "order_id IS NOT NULL")
@dlt.expect("valid_order_status", "order_status IN ('Pending', 'Shipped', 'Cancelled', 'Completed')")
def stg_orders_clean():
    """Project and re-stamp the staged orders for the ``stg_orders_clean`` table.

    Reads ``LIVE.stg_orders`` as a stream, keeps only the order columns
    (casting ``order_timestamp`` to ``date``), and sets ``ingest_timestamp``
    and ``load_date`` from the current timestamp/date. The data quality
    rules are declared by the expectation decorators above.

    Returns:
        The streaming DataFrame that backs the ``stg_orders_clean`` table.
    """
    # Columns carried forward from staging; the audit columns are re-added below.
    order_columns = [
        "order_id",
        "customer_id",
        col("order_timestamp").cast("date"),
        "payment_method",
        "items",
        "order_status",
    ]

    staged_orders = spark.readStream.table("LIVE.stg_orders")

    return (
        staged_orders
        .select(*order_columns)
        .withColumn("ingest_timestamp", current_timestamp())
        .withColumn("load_date", current_date())
    )

#### 3. Flatten the orders data by exploding the order items - raw_orders

In [None]:
@dlt.table(
    name="raw_orders",
    comment="Flattened streaming orders with exploded items",
    table_properties={'quality': 'raw'},
)
def silver_orders():
    """Flatten the cleaned orders into one row per order item for ``raw_orders``.

    Streams ``stg_orders_clean``, explodes the ``items`` array into an
    ``item`` column, and projects the order-level columns alongside the
    item's ``item_id``, ``name``, ``price``, ``quantity`` and ``category``
    fields (aliased with an ``item_`` prefix). ``ingest_timestamp`` and
    ``load_date`` are then set from the current timestamp/date.

    Returns:
        The streaming DataFrame that backs the ``raw_orders`` table.
    """
    clean_orders = dlt.read_stream("stg_orders_clean")

    # One output row per element of the `items` array.
    exploded_orders = clean_orders.withColumn("item", explode(col("items")))

    flattened_orders = exploded_orders.select(
        "order_id",
        "customer_id",
        "order_timestamp",
        "payment_method",
        "order_status",
        col("item.item_id").alias("item_id"),
        col("item.name").alias("item_name"),
        col("item.price").alias("item_price"),
        col("item.quantity").alias("item_quantity"),
        col("item.category").alias("item_category"),
    )

    return (
        flattened_orders
        .withColumn("ingest_timestamp", current_timestamp())
        .withColumn("load_date", current_date())
    )


