### Silver Processing

In [6]:
from pyspark.sql.types import *
from delta.tables import *
from pyspark.sql.functions import when, lit, col, current_timestamp, input_file_name

# Load the bronze-layer orders and derive the extra silver-layer columns.
df = spark.read.table("orders_bronze")

# True for orders dated before 2019-08-01, False for every other row.
is_flagged = when(col("OrderDate") < '2019-08-01', True).otherwise(False)

# A null or empty CustomerName is replaced with the literal "Unknown".
customer_name = col("CustomerName")
customer_name_clean = when(
    customer_name.isNull() | (customer_name == ""), lit("Unknown")
).otherwise(customer_name)

# IsFlagged, CreatedTS and ModifiedTS are added as new columns;
# CustomerName is overwritten with its cleaned value.
df = (
    df
    .withColumn("IsFlagged", is_flagged)
    .withColumn("CreatedTS", current_timestamp())
    .withColumn("ModifiedTS", current_timestamp())
    .withColumn("CustomerName", customer_name_clean)
)

# Instantiate the silver table if it does not exist yet.
# CreatedTS / ModifiedTS are filled from current_timestamp() upstream, so they are
# declared as TimestampType; declaring them as DateType drops the time-of-day part
# of every audit value on write.
# NOTE(review): createIfNotExists should leave an already-existing orders_silver
# untouched, so a table created with the old DateType columns keeps them until it
# is migrated or recreated — confirm before relying on the new types.
(
    DeltaTable.createIfNotExists(spark)
    .tableName("orders_silver")
    .addColumn("SalesOrderNumber", StringType())
    .addColumn("SalesOrderLineNumber", IntegerType())
    .addColumn("OrderDate", DateType())
    .addColumn("CustomerName", StringType())
    .addColumn("Email", StringType())
    .addColumn("Item", StringType())
    .addColumn("Quantity", IntegerType())
    .addColumn("UnitPrice", FloatType())
    .addColumn("Tax", FloatType())
    .addColumn("IsFlagged", BooleanType())
    .addColumn("CreatedTS", TimestampType())
    .addColumn("ModifiedTS", TimestampType())
    .execute()
)


StatementMeta(, 16372a78-8038-4b20-a019-b5f53c4653b3, 8, Finished, Available, Finished)

<delta.tables.DeltaTable at 0x794da265dc50>

In [9]:
# Update existing records and insert new ones based on a condition defined by the columns SalesOrderNumber, OrderDate, CustomerName, and Item.

from delta.tables import *
from datetime import datetime 
import time

# Upsert the transformed bronze rows (`df`) into orders_silver and capture the outcome
# (row count, status, error text, duration) in plain variables for the monitoring log.
deltaTable = DeltaTable.forName(spark, 'orders_silver')

start_time = time.time()

table_name = "orders_silver"
load_path = "qa_lakehouse_1312.orders_silver"

try:

    dfUpdates = df

    # Number of source rows offered to the merge, not the number inserted or updated.
    record_count = dfUpdates.count()

    (
        deltaTable.alias('silver')
        .merge(
            dfUpdates.alias('updates'),
            # A row is "the same order line" when all four key columns are equal.
            'silver.SalesOrderNumber = updates.SalesOrderNumber and silver.OrderDate = updates.OrderDate and silver.CustomerName = updates.CustomerName and silver.Item = updates.Item'
        )
        .whenMatchedUpdate(set =
            {
            # Previously this mapping was empty, so matched rows were never changed and
            # ModifiedTS was never refreshed. Matched rows now take the incoming non-key
            # values. CreatedTS is intentionally absent so the original creation
            # timestamp is kept.
            "SalesOrderLineNumber": "updates.SalesOrderLineNumber",
            "Email": "updates.Email",
            "Quantity": "updates.Quantity",
            "UnitPrice": "updates.UnitPrice",
            "Tax": "updates.Tax",
            "IsFlagged": "updates.IsFlagged",
            "ModifiedTS": "updates.ModifiedTS"
            }
        )
        .whenNotMatchedInsert(values =
            {
            "SalesOrderNumber": "updates.SalesOrderNumber",
            "SalesOrderLineNumber": "updates.SalesOrderLineNumber",
            "OrderDate": "updates.OrderDate",
            "CustomerName": "updates.CustomerName",
            "Email": "updates.Email",
            "Item": "updates.Item",
            "Quantity": "updates.Quantity",
            "UnitPrice": "updates.UnitPrice",
            "Tax": "updates.Tax",
            "IsFlagged": "updates.IsFlagged",
            "CreatedTS": "updates.CreatedTS",
            "ModifiedTS": "updates.ModifiedTS"
            }
        )
        .execute()
    )

    load_status = "SUCCESS"
    error_message = None

except Exception as e:
    # The failure is recorded instead of re-raised, so the notebook keeps running
    # and the error text is available to the monitoring log.
    record_count = 0
    load_status = "FAILURE"
    error_message = str(e)

end_time = time.time()
# Wall-clock seconds covering the count and the merge (or the failed attempt).
duration = (end_time - start_time)

# Layout of one monitoring_etl log row, as (column name, Spark type) pairs.
monitoring_fields = [
    ("load_timestamp", TimestampType()),
    ("Table_name", StringType()),
    ("Source_path", StringType()),
    ("record_count", IntegerType()),
    ("Status", StringType()),
    ("error_message", StringType()),
    ("duration", FloatType()),
]
monitoring_schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in monitoring_fields
])

# Exactly one row per run, stamped with the current local time and carrying the
# values captured around the merge.
monitoring_data = [(
    datetime.now(),
    table_name,
    load_path,
    record_count,
    load_status,
    error_message,
    duration,
)]

monitoring_df = spark.createDataFrame(monitoring_data, schema=monitoring_schema)

# Append mode adds this run's row to the rows already in monitoring_etl.
monitoring_df.write.mode("append").saveAsTable("monitoring_etl")

StatementMeta(, 16372a78-8038-4b20-a019-b5f53c4653b3, 11, Finished, Available, Finished)