In [0]:
# DLT works with 3 types of datasets
# Streaming Tables (Permanent/Temporary) - used as Append Data Sources, Incremental data
# Materialized views - used for transformation, aggregations or computations
# Views - used for intermediate transformations, not stored in target schema

import dlt

In [0]:
# create a streaming table for orders
@dlt.table(
  comment = "Streaming table for orders bronze table",
  table_properties = {"quality" : "bronze"}
)
def orders_bronze():
  """Bronze orders dataset.

  Returns a streaming DataFrame opened with ``spark.readStream`` over the
  ``dev.bronze.orders_raw`` table; no transformation is applied here.
  """
  return spark.readStream.table("dev.bronze.orders_raw")

#@dlt.create_streaming_table(comment = "Streaming table for orders")
#def orders():
#  return spark.readStream.table("orders")

In [0]:
# create a materialized view for customers
# (batch read with spark.read, unlike the streaming table used for orders)
@dlt.table(
  name = "customer_bronze",
  comment = "Materliazed view for customer bronze table",
  table_properties = {"quality" : "bronze"}
)
def cust_bronze():
  """Bronze customer dataset, registered under the name ``customer_bronze``.

  Returns a batch (non-streaming) read of ``dev.bronze.customer_raw``
  without applying any transformation.
  """
  customers_raw = spark.read.table("dev.bronze.customer_raw")
  return customers_raw


#@dlt.create_view(comment = "Materialized view for customers")
#def customers():
#  return spark.readStream.table("customers")
# create a view for products
#@dlt.create_view(comment = "View for products")
#def products():
#  return spark.readStream.table("products")
# create a view for sales
#@dlt.create_view(comment = "View for sales")
#def sales():
#  return spark.readStream.table("sales")

In [0]:
# create a view to join orders with customers
@dlt.view(
    # The previous description was copied from the customer bronze table and
    # described the wrong dataset; this one states what the view contains.
    comment = "View joining orders bronze with customer bronze on customer key"
)
def joined_vw():
  """Intermediate view that joins orders to their customers.

  Reads ``LIVE.orders_bronze`` and ``LIVE.customer_bronze`` in batch mode and
  left-outer joins them on ``o_custkey == c_custkey``. Orders are the left
  side, so every order row is kept even when no customer row matches.

  Returns:
    The joined DataFrame containing the columns of both inputs.
  """
  orders = spark.read.table("LIVE.orders_bronze")
  customers = spark.read.table("LIVE.customer_bronze")

  # Orders drive the join: unmatched orders survive with null customer columns.
  return orders.join(
    customers,
    on = orders.o_custkey == customers.c_custkey,
    how = "left_outer"
  )


In [0]:
# create MV to add new column
from pyspark.sql.functions import current_timestamp

@dlt.table(
  name = "joined_silver",
  comment = "joined table",
  table_properties = {"quality" : "silver"}
)
def joined_silver():
  """Silver dataset built from ``LIVE.joined_vw``.

  Returns the rows of the joined view with one extra column,
  ``__insert_date``, populated from ``current_timestamp()``.
  """
  joined = spark.read.table("LIVE.joined_vw")
  stamped = joined.withColumn("__insert_date", current_timestamp())
  return stamped

In [0]:
# Aggregate based on c_mktsegment and find the count of orders (o_orderkey)
from pyspark.sql.functions import current_timestamp, count

@dlt.table(
  comment = "orders aggregated able",
  table_properties = {"quality" : "gold"}
)
def orders_agg_gold():
  """Gold dataset: order counts per market segment.

  Reads ``LIVE.joined_silver``, groups by ``c_mktsegment`` and counts
  ``o_orderkey`` per group. The result column is named ``sum_orders`` even
  though it holds a count, not a sum. An ``__insert_date`` column populated
  from ``current_timestamp()`` is appended.
  """
  silver = spark.read.table("LIVE.joined_silver")

  orders_per_segment = silver.groupBy("c_mktsegment").agg(
    count("o_orderkey").alias("sum_orders")
  )

  return orders_per_segment.withColumn("__insert_date", current_timestamp())