In [0]:
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *

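# The following code uses Delta Live Tables (DLT) decorators, so it cannot be run interactively on a compute cluster; it must be run as part of a pipeline (job).
# The following three commands show the step-by-step refinement of Delta tables from the Bronze level to the Gold level.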

# Path to the raw, uncompressed JSON clickstream source file under /databricks-datasets.
# It is kept in a module-level variable and loaded by clickstream_raw() below.
json_path = "/databricks-datasets/wikipedia-datasets/data-001/clickstream/raw-uncompressed-json/2015_2_clickstream.json"
# 'dlt' is the Delta Live Tables Python module; @dlt.table is a decorator it provides.
# @dlt.table works like a "create table" statement.
# It can define a comment, a table name, and so on.
# It must be followed by a function that returns a DataFrame, from which @dlt.table creates the table.
# If no name is given explicitly, the Delta table takes the name of the function ('clickstream_raw' here).
# Note: this Delta table represents a Bronze table.
# @dlt.create_table() is accepted here as well.
@dlt.table(
  comment="The raw wikipedia clickstream dataset, ingested from /databricks-datasets.",
)
def clickstream_raw():
  """Load the JSON file at ``json_path`` and return it as a DataFrame.

  The returned DataFrame is what @dlt.table materializes as the
  ``clickstream_raw`` table (the Bronze level of this example).
  """
  json_reader = spark.read.format("json")
  raw_df = json_reader.load(json_path)
  return raw_df


# Create a new Silver Delta table from the Bronze Delta table defined in the earlier step.
# Note the two new decorators, @dlt.expect() and @dlt.expect_or_fail(), which define expectations. Several expectation decorators may be stacked on one table.
# Expectations are evaluated as records flow into the target table and are used to measure data quality:
# @dlt.expect() reports violations but keeps the rows, while @dlt.expect_or_fail() stops the update as soon as a row violates the constraint.
# Also note dlt.read(), which reads from a table defined in the pipeline (whereas spark.read loads data from files).
# dlt.read() also creates a dependency, so clickstream_raw() must be executed before clickstream_prepared().
# Note: this Delta table represents a Silver table.
@dlt.table(
  comment="Wikipedia clickstream data cleaned and prepared for analysis.",
)
@dlt.expect("valid_current_page_title", "current_page_title IS NOT NULL")
@dlt.expect_or_fail("valid_count", "click_count > 0")
def clickstream_prepared():
  """Return the raw clickstream rows with a typed count and readable column names.

  Reads the ``clickstream_raw`` table, adds ``click_count`` as ``n`` cast to
  INT, renames ``curr_title``/``prev_title`` to
  ``current_page_title``/``previous_page_title``, and keeps only those three
  columns. The two expectations declared above are checked against the result.
  """
  raw_clicks = dlt.read("clickstream_raw")

  # Derive the integer click count first; the original column `n` is dropped by the final select.
  with_count = raw_clicks.withColumn("click_count", expr("CAST(n AS INT)"))

  renamed = (
    with_count
      .withColumnRenamed("curr_title", "current_page_title")
      .withColumnRenamed("prev_title", "previous_page_title")
  )

  return renamed.select("current_page_title", "click_count", "previous_page_title")

# Create a new Gold Delta table from the Silver Delta table defined in the earlier step.
# dlt.read() defines the dependency on the Silver table.
@dlt.table(
  comment="A table containing the top pages linking to the Apache Spark page.",
)
def top_spark_referrers():
  """Return the ten highest-click referrers of the 'Apache_Spark' page.

  Reads the ``clickstream_prepared`` table, keeps rows whose
  ``current_page_title`` equals 'Apache_Spark', exposes
  ``previous_page_title`` as ``referrer``, orders by ``click_count``
  descending, and returns the first 10 rows as (referrer, click_count).
  """
  prepared = dlt.read("clickstream_prepared")
  spark_page_rows = prepared.filter(expr("current_page_title == 'Apache_Spark'"))

  # Rename before sorting/selecting so the output column is already called `referrer`.
  ranked = (
    spark_page_rows
      .withColumnRenamed("previous_page_title", "referrer")
      .sort(desc("click_count"))
  )

  top_ten = ranked.select("referrer", "click_count").limit(10)
  return top_ten

message
"This Delta Live Tables query is syntactically valid, but you must create a pipeline in order to define and populate your table."
