# Gold Layer Transformations

## Imports

In [None]:
# Import PySpark libraries
from pyspark.sql.types import *
from pyspark.sql.functions import *
from delta.tables import *

## Date Dimension - Shared Across All Facts

In [None]:
# Source dates from all Silver layer tables.
# Each source is projected to a single column named "date" so the positional
# union below lines up.
dates_orders = spark.read.table("orders_header_silver").select(col("order_date").alias("date"))
dates_reviews = spark.read.table("reviews_silver").select(col("review_date").alias("date"))
dates_web = spark.read.table("web_logs_silver").select(col("event_date").alias("date"))

# Combine all date sources
df_all_dates = dates_orders.union(dates_reviews).union(dates_web)

# Build date dimension with attributes.
# - NULL dates are removed first: the merge condition below uses `=`, which
#   never evaluates to true for NULL, so a NULL row would be re-inserted on
#   every run and break idempotency.
# - No orderBy here: a Delta merge does not preserve source row order, so
#   sorting would only add a shuffle.
df_dim_date_gold = df_all_dates \
    .filter(col("date").isNotNull()) \
    .dropDuplicates(["date"]) \
    .select(
        col("date"),
        dayofmonth("date").alias("day"),
        month("date").alias("month"),
        year("date").alias("year"),
        date_format(col("date"), "MMM-yyyy").alias("mmmyyyy"),
        date_format(col("date"), "yyyyMM").alias("yyyymm")
    )

# Create dim_date_gold table if not exists
DeltaTable.createIfNotExists(spark) \
    .tableName("dim_date_gold") \
    .addColumn("date", DateType()) \
    .addColumn("day", IntegerType()) \
    .addColumn("month", IntegerType()) \
    .addColumn("year", IntegerType()) \
    .addColumn("mmmyyyy", StringType()) \
    .addColumn("yyyymm", StringType()) \
    .execute()

# Idempotent merge: Insert only new dates (existing dates are left untouched)
deltaTable = DeltaTable.forPath(spark, 'Tables/dim_date_gold')

deltaTable.alias('target') \
    .merge(
        df_dim_date_gold.alias('source'),
        'target.date = source.date'
    ) \
    .whenNotMatchedInsertAll() \
    .execute()

## Customer Dimension - Type 1 from SCD2 Source

In [None]:
# Load current customer records from Silver layer
df_customers = spark.read.table("customers_silver").filter(col("is_active") == True)

# Tokenise the display name once; trimming and splitting on runs of whitespace
# keeps leading/trailing/double spaces from producing empty tokens.
customer_name_parts = split(trim(col("customer_name")), r"\s+")

# Build customer dimension with name parsing.
# - De-duplicate on customer_id only, i.e. exactly the merge key below. With
#   (customer_id, email) a customer that has two active rows with different
#   emails would yield two source rows for one target row, and the merge's
#   whenMatchedUpdateAll would fail on the ambiguous match.
# - "first" is the first token; "last" is the final token (not token #2, which
#   is a middle name for names such as "Mary Ann Smith"). Single-token names
#   keep a NULL "last".
df_dim_customer_gold = df_customers.dropDuplicates(["customer_id"]).select(
    col("customer_id"),
    col("customer_name"),
    col("email")
).withColumn("first", customer_name_parts.getItem(0)) \
 .withColumn("last", when(size(customer_name_parts) > 1, element_at(customer_name_parts, -1)))

# Create dim_customer_gold table if not exists
DeltaTable.createIfNotExists(spark) \
    .tableName("dim_customer_gold") \
    .addColumn("customer_id", StringType()) \
    .addColumn("customer_name", StringType()) \
    .addColumn("email", StringType()) \
    .addColumn("first", StringType()) \
    .addColumn("last", StringType()) \
    .execute()

# Idempotent merge: Update customer attributes or insert new customers
deltaTable = DeltaTable.forPath(spark, 'Tables/dim_customer_gold')

deltaTable.alias('target') \
    .merge(
        df_dim_customer_gold.alias('source'),
        'target.customer_id = source.customer_id'
    ) \
    .whenMatchedUpdateAll() \
    .whenNotMatchedInsertAll() \
    .execute()

## Product Dimension - Type 1 from SCD2 Source

In [None]:
# Active product rows from the Silver layer
df_products = spark.read.table("products_silver").where(col("is_active") == True)

# Column name -> Delta type for dim_product_gold. The same list drives both
# the projection and the table definition below.
_dim_product_columns = [
    ("product_id", StringType()),
    ("product_name", StringType()),
    ("category", StringType()),
    ("unit_price", DecimalType(10, 2)),
    ("sales_price", DecimalType(10, 2)),
    ("is_low_stock", BooleanType()),
]

# One row per product_id, projected to the dimension's columns
df_dim_product_gold = (
    df_products
    .dropDuplicates(["product_id"])
    .select(*[col(column_name) for column_name, _ in _dim_product_columns])
)

# Create dim_product_gold if it does not exist yet
_dim_product_builder = DeltaTable.createIfNotExists(spark).tableName("dim_product_gold")
for column_name, column_type in _dim_product_columns:
    _dim_product_builder = _dim_product_builder.addColumn(column_name, column_type)
_dim_product_builder.execute()

# Upsert keyed on product_id: matched rows are overwritten with the source
# values, unmatched source rows are inserted.
deltaTable = DeltaTable.forPath(spark, 'Tables/dim_product_gold')

_dim_product_merge = deltaTable.alias('target').merge(
    df_dim_product_gold.alias('source'),
    'target.product_id = source.product_id',
)
_dim_product_merge.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

## Fact Sales - Transactional Sales Data

In [None]:
# Load source data from Silver layer.
# Only order lines and order headers are needed; the fact carries the
# customer_id / product_id keys and does not read the dimension tables.
df_order_lines = spark.read.table("order_lines_silver")
df_order_header = spark.read.table("orders_header_silver")

# Build fact table: one row per order line, enriched with header attributes.
# The left join keeps order lines whose header is missing (header columns are
# NULL for those rows).
# dropDuplicates on order_line_id guards against fan-out if a header order_id
# appears more than once: an insert-only merge does not de-duplicate the source,
# so repeated lines would otherwise be loaded twice on the first run.
df_fact_sales = df_order_lines.alias("ol") \
    .join(df_order_header.alias("oh"), col("ol.order_id") == col("oh.order_id"), "left") \
    .select(
        col("ol.order_line_id"),
        col("ol.order_id"),
        col("oh.customer_id"),
        col("ol.product_id"),
        col("oh.order_date"),
        col("ol.quantity"),
        col("ol.unit_price"),
        col("ol.discount_amount"),
        col("ol.tax_amount"),
        col("ol.line_total"),
        col("oh.payment_method")
    ) \
    .dropDuplicates(["order_line_id"])


# Create fact_sales_gold table if not exists
DeltaTable.createIfNotExists(spark) \
    .tableName("fact_sales_gold") \
    .addColumn("order_line_id", StringType()) \
    .addColumn("order_id", StringType()) \
    .addColumn("customer_id", StringType()) \
    .addColumn("product_id", StringType()) \
    .addColumn("order_date", DateType()) \
    .addColumn("quantity", IntegerType()) \
    .addColumn("unit_price", DecimalType(10, 2)) \
    .addColumn("discount_amount", DecimalType(10, 2)) \
    .addColumn("tax_amount", DecimalType(10, 2)) \
    .addColumn("line_total", DecimalType(12, 2)) \
    .addColumn("payment_method", StringType()) \
    .execute()

# Idempotent merge: Insert only new sales transactions (keyed on order_line_id)
deltaTable = DeltaTable.forPath(spark, 'Tables/fact_sales_gold')

deltaTable.alias('target') \
    .merge(
        df_fact_sales.alias('source'),
        'target.order_line_id = source.order_line_id'
    ) \
    .whenNotMatchedInsertAll() \
    .execute()

## Fact Reviews - Customer Review Events

In [None]:
# Load reviews from Silver layer
df_reviews_silver = spark.read.table("reviews_silver")

# Transform to fact table (review_date is exposed as "date")
df_fact_reviews = df_reviews_silver.select(
    col("customer_id"),
    col("product_id"),
    col("review_date").alias("date"),
    col("rating"),
    col("review_length"),
    col("is_positive_review"),
    col("review_text_clean"),
    col("timestamp")
)

# Create fact_reviews table if not exists
DeltaTable.createIfNotExists(spark) \
    .tableName("fact_reviews") \
    .addColumn("customer_id", StringType()) \
    .addColumn("product_id", StringType()) \
    .addColumn("date", DateType()) \
    .addColumn("rating", IntegerType()) \
    .addColumn("review_length", IntegerType()) \
    .addColumn("is_positive_review", BooleanType()) \
    .addColumn("review_text_clean", StringType()) \
    .addColumn("timestamp", TimestampType()) \
    .execute()

# Idempotent merge: Prevent duplicate reviews.
# Keys are compared with the null-safe operator `<=>`: with plain `=` a review
# whose customer_id, product_id or timestamp is NULL never matches its already
# loaded copy and would be inserted again on every run.
deltaTable = DeltaTable.forPath(spark, 'Tables/fact_reviews')

deltaTable.alias('target') \
    .merge(
        df_fact_reviews.alias('source'),
        'target.customer_id <=> source.customer_id AND target.product_id <=> source.product_id AND target.timestamp <=> source.timestamp'
    ) \
    .whenNotMatchedInsertAll() \
    .execute()

## Fact Web Events - Customer Behavior Events

In [None]:
# Load web logs from Silver layer
df_web_logs_silver = spark.read.table("web_logs_silver")

# Transform to fact table (event_date is exposed as "date")
df_fact_web_events = df_web_logs_silver.select(
    col("customer_id"),
    col("event_date").alias("date"),
    col("page"),
    col("page_type"),
    col("page_category"),
    col("action"),
    col("is_bot_user"),
    col("is_conversion_action"),
    col("is_engagement_action"),
    col("timestamp")
)

# Create fact_web_events table if not exists
DeltaTable.createIfNotExists(spark) \
    .tableName("fact_web_events") \
    .addColumn("customer_id", StringType()) \
    .addColumn("date", DateType()) \
    .addColumn("page", StringType()) \
    .addColumn("page_type", StringType()) \
    .addColumn("page_category", StringType()) \
    .addColumn("action", StringType()) \
    .addColumn("is_bot_user", BooleanType()) \
    .addColumn("is_conversion_action", BooleanType()) \
    .addColumn("is_engagement_action", BooleanType()) \
    .addColumn("timestamp", TimestampType()) \
    .execute()

# Idempotent merge: Prevent duplicate events.
# Keys are compared with the null-safe operator `<=>`: with plain `=` an event
# whose customer_id (or timestamp) is NULL never matches its already loaded
# copy, so it would be appended again on every run.
deltaTable = DeltaTable.forPath(spark, 'Tables/fact_web_events')

deltaTable.alias('target') \
    .merge(
        df_fact_web_events.alias('source'),
        'target.customer_id <=> source.customer_id AND target.timestamp <=> source.timestamp'
    ) \
    .whenNotMatchedInsertAll() \
    .execute()