In [0]:
import pyspark.sql.functions as F
# Catalog portion of every three-part table name (<catalog>.<schema>.<table>) read or written below.
catalog_name = "olist"

### Load Silver Data

In [0]:
# Source for every aggregate in this notebook: the silver-schema order fact table.
silver_order_fact_table = f"{catalog_name}.silver.slv_order_fact"
order_fact = spark.table(silver_order_fact_table)

### Core Business Metrics

In [0]:
# Daily revenue: one row per order_purchase_date holding the summed payment
# value and the number of distinct orders placed on that date.
# NOTE(review): the distinct count on order_id suggests order_fact may hold
# several rows per order. If so, confirm total_payment_value is not repeated
# across those rows, otherwise the sum double-counts.
daily_payment_total = F.sum("total_payment_value").alias("daily_revenue")
daily_order_count = F.countDistinct("order_id").alias("total_orders")

orders_by_day = order_fact.groupBy("order_purchase_date")
daily_revenue = orders_by_day.agg(daily_payment_total, daily_order_count)

In [0]:
# Monthly revenue: bucket each row by the "yyyy-MM" rendering of its purchase
# date, then sum the payment value inside each bucket.
purchase_month = F.date_format("order_purchase_date", "yyyy-MM")
orders_with_month = order_fact.withColumn("year_month", purchase_month)

monthly_payment_total = F.sum("total_payment_value").alias("monthly_revenue")
monthly_revenue = orders_with_month.groupBy("year_month").agg(monthly_payment_total)

### Product Analytics

In [0]:
# Revenue and distinct order count per English product category name.
# NOTE(review): despite the "top" in the name, no ordering or limit is applied
# here, and no later cell visible in this notebook reads or persists this
# DataFrame. Confirm whether it should be sorted and saved.
category_revenue = F.sum("total_payment_value").alias("revenue")
category_order_count = F.countDistinct("order_id").alias("orders")

orders_by_category = order_fact.groupBy("product_category_name_english")
top_products = orders_by_category.agg(category_revenue, category_order_count)

### Customer-Level Metrics

In [0]:
# Per-customer rollup keyed by customer_unique_id: distinct order count, total
# spend, averages of payment value / review score / delivery timing, and the
# most recent purchase date.
# NOTE(review): avg_order_value is the mean of total_payment_value over fact
# rows, not over distinct orders. The two agree only if order_fact has exactly
# one row per order, so confirm the table's grain.
customer_aggregates = [
    F.countDistinct("order_id").alias("total_orders"),
    F.sum("total_payment_value").alias("total_spend"),
    F.avg("total_payment_value").alias("avg_order_value"),
    F.avg("avg_review_score").alias("avg_review_score"),
    F.avg("delivery_days").alias("avg_delivery_days"),
    F.avg("delivery_delay_days").alias("avg_delivery_delay"),
    F.max("order_purchase_date").alias("last_purchase_date"),
]

customer_metrics = order_fact.groupBy("customer_unique_id").agg(*customer_aggregates)

### RFM (Recency, Frequency, Monetary) Metrics

In [0]:
# Anchor recency on the latest purchase date present in the data (pulled to the
# driver as a plain Python value) rather than on today's date.
reference_date = order_fact.agg(F.max("order_purchase_date")).first()[0]

# Recency: days between the reference date and the customer's last purchase.
days_since_last_purchase = F.datediff(
    F.lit(reference_date), F.col("last_purchase_date")
)

# Append the three RFM columns after the existing customer metrics; frequency
# and monetary are copies of total_orders and total_spend under RFM names.
rfm = customer_metrics.select(
    "*",
    days_since_last_purchase.alias("recency_days"),
    F.col("total_orders").alias("frequency"),
    F.col("total_spend").alias("monetary"),
)

### High-Value Customer Label

In [0]:
# Spend cut-off for the high-value label: the approximate 75th percentile of
# the `monetary` column in rfm.
HIGH_VALUE_SPEND_QUANTILE = 0.75  # customers at or above this quantile are "high value"
QUANTILE_RELATIVE_ERROR = 0.01    # relative error passed to approxQuantile

spend_quantiles = rfm.approxQuantile(
    "monetary",
    [HIGH_VALUE_SPEND_QUANTILE],
    QUANTILE_RELATIVE_ERROR,
)

# approxQuantile returns an empty list when the column holds no non-null values
# (for example, an empty source table). Indexing it blindly would raise a bare
# IndexError, so fail with a message that names the actual problem.
if not spend_quantiles:
    raise ValueError(
        "Cannot compute the high-value spend threshold: "
        "rfm has no non-null 'monetary' values."
    )

spend_threshold = spend_quantiles[0]

In [0]:
# Binary label: 1 when the customer's spend reaches the threshold, else 0.
# A null `monetary` makes the comparison null, which falls through to 0.
meets_spend_threshold = F.col("monetary") >= spend_threshold
high_value_flag = F.when(meets_spend_threshold, 1).otherwise(0)

rfm_labeled = rfm.withColumn("high_value_customer", high_value_flag)

### ML Training Dataset

In [0]:
# Columns exported for model training: the customer key, the RFM and service
# features, and the binary label as the last column.
# NOTE(review): high_value_customer is computed by thresholding `monetary`, so
# a model that receives `monetary` as an input can reproduce the label exactly.
# Confirm that keeping it in the feature set is intended.
ml_feature_columns = [
    "customer_unique_id",
    "recency_days",
    "frequency",
    "monetary",
    "avg_order_value",
    "avg_review_score",
    "avg_delivery_days",
    "avg_delivery_delay",
    "high_value_customer",
]

ml_features = rfm_labeled.select(*ml_feature_columns)

### Save Gold Tables

In [0]:
# Persist each gold DataFrame as a Delta table in the gold schema. Every write
# uses overwrite mode with the mergeSchema option set, and the tables are
# written in the order listed.
# NOTE(review): top_products is not in this list, so it is never persisted.
# Confirm whether a gold table is expected for it.
gold_outputs = [
    (daily_revenue, f"{catalog_name}.gold.gld_daily_revenue"),
    (monthly_revenue, f"{catalog_name}.gold.gld_monthly_revenue"),
    (customer_metrics, f"{catalog_name}.gold.gld_customer_metrics"),
    (ml_features, f"{catalog_name}.gold.gld_ml_features"),
]

for gold_df, gold_table in gold_outputs:
    (
        gold_df.write.format("delta")
        .mode("overwrite")
        .option("mergeSchema", "true")
        .saveAsTable(gold_table)
    )