### Process the Customers Data
1. Ingest the data into the data lakehouse - stg_customers
2. Perform data quality checks and transform the data as required - stg_customers_clean
3. Apply changes to the Customers data - raw_customers

In [None]:
import dlt
from pyspark.sql.functions import col, current_timestamp, current_date

#### 1. Ingest the data into the data lakehouse - stg_customers

In [None]:
@dlt.table(
    name="stg_customers",  # may also be schema-qualified: <schema_name>.stg_customers
    comment="The customers data ingested from the customer's data lakehouse.",
    table_properties={
        "quality": "staging",
        "delta.autoOptimize.optimizeWrite": "true",
    },
    # path = "location to store delta table"
)
def stg_customers():
    """Stream the customer JSON files from the landing volume into staging.

    The files are read with the ``cloudFiles`` streaming source and three
    audit columns are appended to every row:

    * ``input_file_path``  - the ``_metadata.file_path`` of the source file
    * ``ingest_timestamp`` - ``current_timestamp()`` at processing time
    * ``load_date``        - ``current_date()`` at processing time

    Returns:
        The streaming DataFrame that backs the ``stg_customers`` table.
    """
    # NOTE(review): `cloudFiles.inferSchema` does not appear in the documented
    # Auto Loader option list; confirm it is needed alongside
    # `cloudFiles.inferColumnTypes`.
    landed_customers = (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "json")
        .option("cloudFiles.inferSchema", "true")
        .option("cloudFiles.inferColumnTypes", "true")
        .option(
            "cloudFiles.schemaLocation",
            "/Volumes/circuitbox/landing/operational_data/schema/customers/",
        )
        .load("/Volumes/circuitbox/landing/operational_data/customers/")
    )

    # Audit columns: where each row came from and when it was processed.
    with_source_file = landed_customers.withColumn(
        "input_file_path", col("_metadata.file_path")
    )
    with_ingest_time = with_source_file.withColumn(
        "ingest_timestamp", current_timestamp()
    )
    return with_ingest_time.withColumn("load_date", current_date())

#### 2. Perform data quality checks and transform the data as required - stg_customers_clean

In [None]:
@dlt.table(
    name="stg_customers_clean",
    comment="The customers data after data quality checks and transformation.",
    table_properties={
        "quality": "staging",
        "delta.autoOptimize.optimizeWrite": "true",
    },
)
@dlt.expect_or_fail("valid_customer_id", "customer_id IS NOT NULL")
@dlt.expect_or_drop("valid_customer_name", "customer_name IS NOT NULL")
@dlt.expect("length_telephone", "LENGTH(telephone)>=10")
@dlt.expect("valid_email", "email IS NOT NULL")
def stg_customers_clean():
    """Validate and reshape the staged customers stream.

    Expectations declared on this table:

    * ``valid_customer_id``   (expect_or_fail) - ``customer_id`` must not be null
    * ``valid_customer_name`` (expect_or_drop) - ``customer_name`` must not be null
    * ``length_telephone``    (expect)         - ``telephone`` has 10+ characters
    * ``valid_email``         (expect)         - ``email`` must not be null

    Only the business columns are kept from ``LIVE.stg_customers``;
    ``date_of_birth`` and ``created_date`` are cast to ``date``.

    Returns:
        The streaming DataFrame that backs the ``stg_customers_clean`` table.
    """
    staged_customers = spark.readStream.table("LIVE.stg_customers")

    # Project the business columns, casting the two date fields.
    business_columns = staged_customers.select(
        "customer_id",
        "customer_name",
        col("date_of_birth").cast("date"),
        "telephone",
        "email",
        col("created_date").cast("date"),
    )

    # The projection above leaves out the staging audit columns, so they are
    # recomputed here at this step's processing time.
    return (
        business_columns
        .withColumn("ingest_timestamp", current_timestamp())
        .withColumn("load_date", current_date())
    )

#### 3. Apply changes to the Customers data - raw_customers

In [None]:
# Declare the streaming table that receives the applied customer changes.
dlt.create_streaming_table(
    name="raw_customers",
    comment="SCD Type 1 customers data",
    table_properties={"quality": "raw"},
)

# Apply rows from stg_customers_clean to raw_customers as SCD Type 1,
# keyed on customer_id and ordered by created_date.
# NOTE(review): created_date is cast to DATE in stg_customers_clean, so
# several changes to one customer on the same day share a sequence value -
# confirm that this ordering is sufficient.
dlt.apply_changes(
    target="raw_customers",
    source="stg_customers_clean",
    keys=["customer_id"],
    sequence_by="created_date",
    stored_as_scd_type=1,
)