In [3]:
# Import the Spark SQL types/functions and the Delta Lake table API used to build the Gold layer
from pyspark.sql.types import *
from pyspark.sql.functions import col, dayofmonth, month, year, date_format, split, monotonically_increasing_id, col, when, coalesce, max, lit
from delta.tables import *


StatementMeta(, 99ae979d-73d2-4932-b521-d7cdb7cc97b5, 5, Finished, Available, Finished)

## Create Date Dimension, populated using order_date.

In [2]:
# Load order headers from the Silver layer; their order_date values feed the date dimension
df = spark.read.table("orders_header_silver")

# Build one row per distinct, non-NULL order_date together with its calendar attributes.
# NULL dates are excluded on purpose: `NULL = NULL` is never true in the MERGE condition
# below, so a NULL-keyed row would never match and would be inserted again on every run.
df_dim_date_gold = (
    df.filter(col("order_date").isNotNull())
    .dropDuplicates(["order_date"])
    .select(
        col("order_date"),
        dayofmonth("order_date").alias("day"),
        month("order_date").alias("month"),
        year("order_date").alias("year"),
        date_format(col("order_date"), "MMM-yyyy").alias("mmmyyyy"),
        date_format(col("order_date"), "yyyyMM").alias("yyyymm"),
    )
    .orderBy("order_date")
)

# Display the first 10 rows of the dataframe to preview the data
display(df_dim_date_gold.head(10))

# Create the dim_date_gold table with an explicit schema if it does not exist yet
DeltaTable.createIfNotExists(spark) \
    .tableName("dim_date_gold") \
    .addColumn("order_date", DateType()) \
    .addColumn("day", IntegerType()) \
    .addColumn("month", IntegerType()) \
    .addColumn("year", IntegerType()) \
    .addColumn("mmmyyyy", StringType()) \
    .addColumn("yyyymm", StringType()) \
    .execute()

# Open the table by the same name it was created with, so creation and merge
# always address the same table regardless of how relative paths resolve.
deltaTable = DeltaTable.forName(spark, "dim_date_gold")
dfUpdates = df_dim_date_gold

# Insert dates that are not in the dimension yet. No matched clause is needed:
# every attribute is derived from order_date (the merge key), so an existing row
# can never differ from the incoming one.
deltaTable.alias('gold') \
    .merge(
        dfUpdates.alias('updates'),
        'gold.order_date = updates.order_date'
    ) \
    .whenNotMatchedInsert(values = {
        "order_date": "updates.order_date",
        "day": "updates.day",
        "month": "updates.month",
        "year": "updates.year",
        "mmmyyyy": "updates.mmmyyyy",
        "yyyymm": "updates.yyyymm"
    }) \
    .execute()
    

StatementMeta(, f564dd49-baf3-41c1-87c0-f548adec8775, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, cb4c03e5-3b50-4af6-82db-d6e1873da3fa)

## Create customer dimension

In [3]:
# Load customers from the Silver layer, keeping only rows flagged is_active
df = spark.read.table("customers_silver").filter(col("is_active") == True)

# Keep exactly one row per customer_id. The MERGE below is keyed on customer_id alone,
# so de-duplicating on (customer_id, email) could still leave several source rows for
# one key, which yields duplicate dimension rows on insert or a multiple-match error
# on update.
customer_name_parts = split(col("customer_name"), " ")
df_dim_customer_silver = (
    df.dropDuplicates(["customer_id"])
    .select(col("customer_id"), col("customer_name"), col("email"))
    .withColumn("first", customer_name_parts.getItem(0))
    # NOTE(review): `last` is the second space-separated token of customer_name, so
    # names with more than two parts lose the remainder - confirm this is intended.
    .withColumn("last", customer_name_parts.getItem(1))
)

# The existing customer_id is used as the dimension key - no surrogate key is generated
df_dim_customer_gold = df_dim_customer_silver

# Display the first 10 rows of the dataframe to preview the data
display(df_dim_customer_gold.head(10))

# Create the dim_customer_gold table with an explicit schema if it does not exist yet
DeltaTable.createIfNotExists(spark) \
    .tableName("dim_customer_gold") \
    .addColumn("customer_id", StringType()) \
    .addColumn("customer_name", StringType()) \
    .addColumn("first", StringType()) \
    .addColumn("last", StringType()) \
    .addColumn("email", StringType()) \
    .execute()

# Open the table by the same name it was created with
deltaTable = DeltaTable.forName(spark, "dim_customer_gold")
dfUpdates = df_dim_customer_gold

# Upsert: refresh the attributes of customers that already exist and insert new ones.
# The update only fires when the name or email actually changed (`<=>` is the
# NULL-safe equality operator); first/last are derived from customer_name, so
# comparing the name covers them too.
deltaTable.alias('gold') \
    .merge(
        dfUpdates.alias('updates'),
        'gold.customer_id = updates.customer_id'
    ) \
    .whenMatchedUpdate(
        condition = "NOT (gold.customer_name <=> updates.customer_name AND gold.email <=> updates.email)",
        set = {
            "customer_name": "updates.customer_name",
            "first": "updates.first",
            "last": "updates.last",
            "email": "updates.email"
        }
    ) \
    .whenNotMatchedInsert(values = {
        "customer_id": "updates.customer_id",
        "customer_name": "updates.customer_name",
        "first": "updates.first",
        "last": "updates.last",
        "email": "updates.email"
    }) \
    .execute()
    

StatementMeta(, f564dd49-baf3-41c1-87c0-f548adec8775, 5, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 090d44d4-4aa7-4efc-88a9-844df35e61b6)

SynapseWidget(Synapse.DataFrame, e3b8518b-0fde-43e2-ad4b-bab3958f0970)

## Create products dimension

In [2]:
# Load products from the Silver layer, keeping only rows flagged is_active
df = spark.read.table("products_silver").filter(col("is_active") == True)

# Build the product dimension with one row per product_id (the existing id is the key)
df_dim_product_gold = df.dropDuplicates(["product_id"]).select(
    col("product_id"),
    col("product_name"),
    col("category"),
    col("unit_price"),
    col("sales_price"),
    col("is_low_stock")
)

# Display the first 10 rows of the dataframe to preview the data
display(df_dim_product_gold.head(10))

# Create the dim_product_gold table with an explicit schema if it does not exist yet
DeltaTable.createIfNotExists(spark) \
    .tableName("dim_product_gold") \
    .addColumn("product_id", StringType()) \
    .addColumn("product_name", StringType()) \
    .addColumn("category", StringType()) \
    .addColumn("unit_price", DecimalType(10, 2)) \
    .addColumn("sales_price", DecimalType(10, 2)) \
    .addColumn("is_low_stock", BooleanType()) \
    .execute()

# Open the table by the same name it was created with
deltaTable = DeltaTable.forName(spark, "dim_product_gold")
dfUpdates = df_dim_product_gold

# Every non-key column of the dimension; drives both the change check and the column mappings
product_attributes = ["product_name", "category", "unit_price", "sales_price", "is_low_stock"]

# True when at least one attribute differs; `<=>` is NULL-safe equality, so a value
# changing to or from NULL is detected as well.
product_changed = " OR ".join(
    f"NOT (gold.{name} <=> updates.{name})" for name in product_attributes
)

# Upsert: refresh the attributes of products that already exist (only when something
# changed, to avoid rewriting untouched rows) and insert products seen for the first time.
deltaTable.alias('gold') \
    .merge(
        dfUpdates.alias('updates'),
        'gold.product_id = updates.product_id'
    ) \
    .whenMatchedUpdate(
        condition = product_changed,
        set = {name: f"updates.{name}" for name in product_attributes}
    ) \
    .whenNotMatchedInsert(
        values = {name: f"updates.{name}" for name in ["product_id", *product_attributes]}
    ) \
    .execute()
    

StatementMeta(, 99ae979d-73d2-4932-b521-d7cdb7cc97b5, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 33ba032b-a143-42bc-b9f7-c75a6594b028)

## Create sales fact table

In [6]:
# Load the Silver layer inputs of the sales fact
df_order_lines = spark.read.table("order_lines_silver")
df_order_header = spark.read.table("orders_header_silver")

# One active row per product_id, mirroring how dim_product_gold is built; several
# active rows for one product would otherwise multiply the order lines in the join.
df_products = spark.read.table("products_silver") \
    .filter(col("is_active") == True) \
    .dropDuplicates(["product_id"])

# Join order lines to their header and product. customers_silver is not joined:
# customer_id comes from the order header and no customer attribute is selected,
# so that join could only duplicate rows.
df_fact_sales = (
    df_order_lines.alias("ol")
    .join(df_order_header.alias("oh"), col("ol.order_id") == col("oh.order_id"), "left")
    .join(df_products.alias("p"), col("ol.product_id") == col("p.product_id"), "left")
    .select(
        # Keys used to join the fact to the dimensions
        col("ol.order_line_id"),
        col("ol.order_id"),
        col("oh.customer_id"),
        col("ol.product_id"),
        col("oh.order_date"),

        # Measures from order lines
        col("ol.quantity"),
        col("ol.unit_price"),
        col("ol.discount_amount"),
        col("ol.tax_amount"),
        col("ol.line_total"),

        # Measures from products (for profit calculation)
        col("p.unit_price").alias("product_cost"),
        col("p.sales_price").alias("product_sales_price"),

        # Order header attributes
        col("oh.payment_method"),
        col("oh.order_total"),
    )
    # Guarantee one source row per merge key even if an upstream table holds duplicates
    .dropDuplicates(["order_line_id"])
    .withColumn("gross_revenue", col("line_total") + col("discount_amount"))
    .withColumn("total_cost", col("quantity") * col("product_cost"))
    .withColumn("list_price_revenue", col("quantity") * col("product_sales_price"))
    .withColumn("total_profit", col("line_total") - col("total_cost"))
    .withColumn("list_price_profit", col("list_price_revenue") - col("total_cost"))
    # Margins are percentages; a non-positive denominator yields 0.0 instead of dividing
    .withColumn(
        "profit_margin",
        when(col("line_total") > 0, (col("total_profit") / col("line_total")) * 100)
        .otherwise(lit(0.0)),
    )
    .withColumn(
        "list_price_margin",
        when(
            col("list_price_revenue") > 0,
            (col("list_price_profit") / col("list_price_revenue")) * 100,
        ).otherwise(lit(0.0)),
    )
)

# Display the first 10 rows to preview data
display(df_fact_sales.head(10))

# Create the fact_sales_gold table with an explicit schema if it does not exist yet.
# NOTE(review): DecimalType(5, 2) holds at most +/-999.99, so a margin beyond that
# range does not fit - confirm how such rows should be handled.
DeltaTable.createIfNotExists(spark) \
    .tableName("fact_sales_gold") \
    .addColumn("order_line_id", StringType()) \
    .addColumn("order_id", StringType()) \
    .addColumn("customer_id", StringType()) \
    .addColumn("product_id", StringType()) \
    .addColumn("order_date", DateType()) \
    .addColumn("quantity", IntegerType()) \
    .addColumn("unit_price", DecimalType(10, 2)) \
    .addColumn("discount_amount", DecimalType(10, 2)) \
    .addColumn("tax_amount", DecimalType(10, 2)) \
    .addColumn("line_total", DecimalType(12, 2)) \
    .addColumn("product_cost", DecimalType(10, 2)) \
    .addColumn("product_sales_price", DecimalType(10, 2)) \
    .addColumn("payment_method", StringType()) \
    .addColumn("order_total", DecimalType(12, 2)) \
    .addColumn("gross_revenue", DecimalType(12, 2)) \
    .addColumn("total_cost", DecimalType(12, 2)) \
    .addColumn("list_price_revenue", DecimalType(12, 2)) \
    .addColumn("total_profit", DecimalType(12, 2)) \
    .addColumn("list_price_profit", DecimalType(12, 2)) \
    .addColumn("profit_margin", DecimalType(5, 2)) \
    .addColumn("list_price_margin", DecimalType(5, 2)) \
    .execute()

# Open the table by the same name it was created with
deltaTable = DeltaTable.forName(spark, "fact_sales_gold")
dfUpdates = df_fact_sales

# Columns of fact_sales_gold, in schema order; each is inserted from the same-named source column
fact_columns = [
    "order_line_id", "order_id", "customer_id", "product_id", "order_date",
    "quantity", "unit_price", "discount_amount", "tax_amount", "line_total",
    "product_cost", "product_sales_price", "payment_method", "order_total",
    "gross_revenue", "total_cost", "list_price_revenue", "total_profit",
    "list_price_profit", "profit_margin", "list_price_margin",
]

# Insert-only merge: order lines already present in the fact table are left untouched,
# so measures captured at first load (e.g. product_cost) are not overwritten by later
# product price changes; only unseen order_line_id values are appended.
deltaTable.alias('gold') \
    .merge(
        dfUpdates.alias('updates'),
        'gold.order_line_id = updates.order_line_id'
    ) \
    .whenNotMatchedInsert(values = {name: f"updates.{name}" for name in fact_columns}) \
    .execute()


StatementMeta(, 99ae979d-73d2-4932-b521-d7cdb7cc97b5, 8, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 36f3b8a9-5e45-40cf-9c14-7dccf9c2914c)