Daily Revenue

In [0]:
from pyspark.sql import functions as F

# Answers: "How much revenue did we make each day?"

daily_revenue = spark.sql("""
    SELECT
        DATE(o.order_purchase_timestamp)    AS order_day,
        COUNT(DISTINCT o.order_id)          AS total_orders,
        COUNT(DISTINCT o.customer_id)       AS unique_customers,
        ROUND(SUM(oi.price), 2)             AS total_revenue,
        ROUND(SUM(oi.freight_value), 2)     AS total_freight,
        ROUND(SUM(oi.line_total), 2)        AS total_revenue_with_freight,
        ROUND(AVG(oi.price), 2)             AS avg_order_value,
        ROUND(MIN(oi.price), 2)             AS min_order_value,
        ROUND(MAX(oi.price), 2)             AS max_order_value,
        current_timestamp()                 AS updated_at
    FROM ecommerce.silver.orders o
    JOIN ecommerce.silver.order_items oi
        ON o.order_id = oi.order_id
    WHERE o.order_status = 'DELIVERED'
    AND o.order_purchase_timestamp IS NOT NULL
    GROUP BY DATE(o.order_purchase_timestamp)
    ORDER BY order_day
""")

(daily_revenue.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("ecommerce.gold.daily_revenue")
)

count = spark.table("ecommerce.gold.daily_revenue").count()
print(f"gold.daily_revenue: {count:,} records")
spark.table("ecommerce.gold.daily_revenue").show(5)

gold.daily_revenue: 612 records
+----------+------------+----------------+-------------+-------------+--------------------------+---------------+---------------+---------------+--------------------+
| order_day|total_orders|unique_customers|total_revenue|total_freight|total_revenue_with_freight|avg_order_value|min_order_value|max_order_value|          updated_at|
+----------+------------+----------------+-------------+-------------+--------------------------+---------------+---------------+---------------+--------------------+
|2016-09-15|           1|               1|       134.97|         8.49|                    143.46|          44.99|          44.99|          44.99|2026-02-20 03:41:...|
|2016-10-03|           7|               7|       441.98|       117.55|                    559.53|          63.14|           21.9|          128.9|2026-02-20 03:41:...|
|2016-10-04|          54|              54|      8595.89|      1225.53|                   9821.42|         136.44|            9.9|    

Customer LTV

In [0]:
# Answers: "Who are our most valuable customers?"

customer_ltv = spark.sql("""
    SELECT
        c.customer_unique_id,
        c.customer_city,
        c.customer_state,
        COUNT(DISTINCT o.order_id)              AS total_orders,
        ROUND(SUM(oi.price), 2)                 AS lifetime_value,
        ROUND(AVG(oi.price), 2)                 AS avg_order_value,
        ROUND(MAX(oi.price), 2)                 AS max_order_value,
        MIN(DATE(o.order_purchase_timestamp))   AS first_order_date,
        MAX(DATE(o.order_purchase_timestamp))   AS last_order_date,
        DATEDIFF(
            MAX(DATE(o.order_purchase_timestamp)),
            MIN(DATE(o.order_purchase_timestamp))
        )                                       AS customer_age_days,
        ROUND(AVG(r.review_score), 2)           AS avg_review_score,
        current_timestamp()                     AS updated_at
    FROM ecommerce.silver.customers c
    JOIN ecommerce.silver.orders o
        ON c.customer_id = o.customer_id
    JOIN ecommerce.silver.order_items oi
        ON o.order_id = oi.order_id
    LEFT JOIN ecommerce.silver.reviews r
        ON o.order_id = r.order_id
    WHERE o.order_status = 'DELIVERED'
    GROUP BY
        c.customer_unique_id,
        c.customer_city,
        c.customer_state
    ORDER BY lifetime_value DESC
""")

(customer_ltv.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("ecommerce.gold.customer_ltv")
)

count = spark.table("ecommerce.gold.customer_ltv").count()
print(f"gold.customer_ltv: {count:,} records")

# Show top 5 most valuable customers
print("\nTop 5 Most Valuable Customers:")
spark.table("ecommerce.gold.customer_ltv") \
    .select("customer_unique_id", "customer_city",
            "total_orders", "lifetime_value",
            "avg_review_score") \
    .show(5)

gold.customer_ltv: 93,471 records

Top 5 Most Valuable Customers:
+--------------------+--------------+------------+--------------+----------------+
|  customer_unique_id| customer_city|total_orders|lifetime_value|avg_review_score|
+--------------------+--------------+------------+--------------+----------------+
|0a0a92112bd4c708c...|Rio De Janeiro|           1|       13440.0|             1.0|
|da122df9eeddfedc1...|      Araruama|           2|        7388.0|             5.0|
|763c8b1c9c68a0229...|    Vila Velha|           1|        7160.0|             1.0|
|dc4802a71eae9be1d...|  Campo Grande|           1|        6735.0|             5.0|
|459bef486812aa252...|       Vitoria|           1|        6729.0|            NULL|
+--------------------+--------------+------------+--------------+----------------+
only showing top 5 rows


Product Performance

In [0]:
# Answers: "Which products sell the most?"

product_performance = spark.sql("""
    SELECT
        p.product_id,
        p.product_category_name,
        p.product_weight_g,
        p.product_volume_cm3,
        COUNT(DISTINCT oi.order_id)         AS total_orders,
        SUM(oi.order_item_id)               AS total_units_sold,
        ROUND(SUM(oi.price), 2)             AS total_revenue,
        ROUND(AVG(oi.price), 2)             AS avg_selling_price,
        ROUND(SUM(oi.freight_value), 2)     AS total_freight_cost,
        ROUND(AVG(r.review_score), 2)       AS avg_review_score,
        COUNT(DISTINCT oi.seller_id)        AS num_sellers,
        current_timestamp()                 AS updated_at
    FROM ecommerce.silver.products p
    JOIN ecommerce.silver.order_items oi
        ON p.product_id = oi.product_id
    JOIN ecommerce.silver.orders o
        ON oi.order_id = o.order_id
    LEFT JOIN ecommerce.silver.reviews r
        ON o.order_id = r.order_id
    WHERE o.order_status = 'DELIVERED'
    GROUP BY
        p.product_id,
        p.product_category_name,
        p.product_weight_g,
        p.product_volume_cm3
    ORDER BY total_revenue DESC
""")

(product_performance.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("ecommerce.gold.product_performance")
)

count = spark.table("ecommerce.gold.product_performance").count()
print(f"gold.product_performance: {count:,} records")

# Show top 5 best performing products
print("\nTop 5 Products by Revenue:")
spark.table("ecommerce.gold.product_performance") \
    .select("product_category_name", "total_orders",
            "total_units_sold", "total_revenue",
            "avg_review_score") \
    .show(5)

gold.product_performance: 32,216 records

Top 5 Products by Revenue:
+---------------------+------------+----------------+-------------+----------------+
|product_category_name|total_orders|total_units_sold|total_revenue|avg_review_score|
+---------------------+------------+----------------+-------------+----------------+
|         Beleza Saude|         186|             214|      63560.0|            4.24|
|         Beleza Saude|         148|             161|      53652.3|            4.35|
|                  Pcs|          33|              33|     45949.35|            4.67|
| Informatica Acess...|         313|             357|     45620.56|            4.28|
|      Cama Mesa Banho|         456|             534|     42226.46|            3.93|
+---------------------+------------+----------------+-------------+----------------+
only showing top 5 rows


Category Performance

In [0]:
# Answers: "Which product categories make the most money?"

category_performance = spark.sql("""
    SELECT
        p.product_category_name             AS category,
        COUNT(DISTINCT p.product_id)        AS total_products,
        COUNT(DISTINCT oi.order_id)         AS total_orders,
        COUNT(DISTINCT o.customer_id)       AS unique_customers,
        SUM(oi.order_item_id)               AS total_units_sold,
        ROUND(SUM(oi.price), 2)             AS total_revenue,
        ROUND(AVG(oi.price), 2)             AS avg_price,
        ROUND(SUM(oi.freight_value), 2)     AS total_freight,
        ROUND(AVG(r.review_score), 2)       AS avg_review_score,
        current_timestamp()                 AS updated_at
    FROM ecommerce.silver.products p
    JOIN ecommerce.silver.order_items oi
        ON p.product_id = oi.product_id
    JOIN ecommerce.silver.orders o
        ON oi.order_id = o.order_id
    LEFT JOIN ecommerce.silver.reviews r
        ON o.order_id = r.order_id
    WHERE o.order_status = 'DELIVERED'
    AND p.product_category_name IS NOT NULL
    GROUP BY p.product_category_name
    ORDER BY total_revenue DESC
""")

(category_performance.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("ecommerce.gold.category_performance")
)

count = spark.table("ecommerce.gold.category_performance").count()
print(f"gold.category_performance: {count:,} records")

print("\nTop 10 Categories by Revenue:")
spark.table("ecommerce.gold.category_performance") \
    .select("category", "total_orders",
            "total_revenue", "avg_price",
            "avg_review_score") \
    .show(10)

gold.category_performance: 73 records

Top 10 Categories by Revenue:
+--------------------+------------+-------------+---------+----------------+
|            category|total_orders|total_revenue|avg_price|avg_review_score|
+--------------------+------------+-------------+---------+----------------+
|        Beleza Saude|        8647|   1234195.12|   130.18|            4.19|
|  Relogios Presentes|        5495|   1166427.74|   198.88|            4.07|
|     Cama Mesa Banho|        9272|    1031836.6|    93.46|            3.92|
|       Esporte Lazer|        7530|    957262.35|   113.21|            4.17|
|Informatica Acess...|        6530|    891682.66|   116.26|            3.99|
|    Moveis Decoracao|        6307|    713794.42|    87.15|            3.96|
|Utilidades Domest...|        5743|    617376.49|    90.68|            4.11|
|          Cool Stuff|        3559|    610633.89|    164.1|             4.2|
|          Automotivo|        3810|     579341.7|   139.77|            4.12|
|  Ferr

Seller Performance

In [0]:
# Answers: "Which sellers are performing best?"

seller_performance = spark.sql("""
    SELECT
        s.seller_id,
        s.seller_city,
        s.seller_state,
        COUNT(DISTINCT oi.order_id)         AS total_orders,
        COUNT(DISTINCT o.customer_id)       AS unique_customers,
        SUM(oi.order_item_id)               AS total_units_sold,
        ROUND(SUM(oi.price), 2)             AS total_revenue,
        ROUND(AVG(oi.price), 2)             AS avg_price,
        ROUND(SUM(oi.freight_value), 2)     AS total_freight,
        ROUND(AVG(r.review_score), 2)       AS avg_review_score,
        current_timestamp()                 AS updated_at
    FROM ecommerce.silver.sellers s
    JOIN ecommerce.silver.order_items oi
        ON s.seller_id = oi.seller_id
    JOIN ecommerce.silver.orders o
        ON oi.order_id = o.order_id
    LEFT JOIN ecommerce.silver.reviews r
        ON o.order_id = r.order_id
    WHERE o.order_status = 'DELIVERED'
    GROUP BY
        s.seller_id,
        s.seller_city,
        s.seller_state
    ORDER BY total_revenue DESC
""")

(seller_performance.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("ecommerce.gold.seller_performance")
)

count = spark.table("ecommerce.gold.seller_performance").count()
print(f"gold.seller_performance: {count:,} records")

print("\nTop 5 Sellers by Revenue:")
spark.table("ecommerce.gold.seller_performance") \
    .select("seller_id", "seller_city",
            "total_orders", "total_revenue",
            "avg_review_score") \
    .show(5)

gold.seller_performance: 2,970 records

Top 5 Sellers by Revenue:
+--------------------+----------------+------------+-------------+----------------+
|           seller_id|     seller_city|total_orders|total_revenue|avg_review_score|
+--------------------+----------------+------------+-------------+----------------+
|4869f7a5dfa277a7d...|         Guariba|        1124|    226987.93|            4.14|
|53243585a1d6dc264...|Lauro De Freitas|         348|    217940.44|            4.13|
|4a3ca9315b744ce9f...|        Ibitinga|        1772|    198631.52|            3.83|
|fa1c13f2614d7b5c4...|          Sumare|         578|    190917.14|            4.37|
|7c67e1448b00f6e96...| Itaquaquecetuba|         973|    187366.92|            3.35|
+--------------------+----------------+------------+-------------+----------------+
only showing top 5 rows


Payment Analysis

In [0]:
# Answers: "How do customers prefer to pay?"

payment_analysis = spark.sql("""
    SELECT
        p.payment_type,
        COUNT(DISTINCT p.order_id)          AS total_orders,
        ROUND(SUM(p.payment_value), 2)      AS total_revenue,
        ROUND(AVG(p.payment_value), 2)      AS avg_payment_value,
        ROUND(AVG(p.payment_installments),2) AS avg_installments,
        MAX(p.payment_installments)         AS max_installments,
        current_timestamp()                 AS updated_at
    FROM ecommerce.silver.payments p
    JOIN ecommerce.silver.orders o
        ON p.order_id = o.order_id
    WHERE o.order_status = 'DELIVERED'
    GROUP BY p.payment_type
    ORDER BY total_revenue DESC
""")

(payment_analysis.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("ecommerce.gold.payment_analysis")
)

count = spark.table("ecommerce.gold.payment_analysis").count()
print(f"gold.payment_analysis: {count:,} records")
spark.table("ecommerce.gold.payment_analysis").show()

gold.payment_analysis: 4 records
+------------+------------+-------------+-----------------+----------------+----------------+--------------------+
|payment_type|total_orders|total_revenue|avg_payment_value|avg_installments|max_installments|          updated_at|
+------------+------------+-------------+-----------------+----------------+----------------+--------------------+
| CREDIT_CARD|       74304|1.210109488E7|           162.24|             3.5|              24|2026-02-20 03:52:...|
|      BOLETO|       19191|   2769932.58|           144.33|             1.0|               1|2026-02-20 03:52:...|
|     VOUCHER|        3679|    343013.19|            62.45|             1.0|               1|2026-02-20 03:52:...|
|  DEBIT_CARD|        1485|    208421.12|           140.26|             1.0|               1|2026-02-20 03:52:...|
+------------+------------+-------------+-----------------+----------------+----------------+--------------------+



In [0]:
print("=" * 55)
print("GOLD LAYER — FINAL VERIFICATION")
print("=" * 55)

gold_tables = {
    "daily_revenue":       "ecommerce.gold.daily_revenue",
    "customer_ltv":        "ecommerce.gold.customer_ltv",
    "product_performance": "ecommerce.gold.product_performance",
    "category_performance":"ecommerce.gold.category_performance",
    "seller_performance":  "ecommerce.gold.seller_performance",
    "payment_analysis":    "ecommerce.gold.payment_analysis",
}

for name, table in gold_tables.items():
    count = spark.table(table).count()
    print(f"{name:<25} {count:>10,} records")

# ── Key Business Metrics ─────────────────────────────────────────
print("\n" + "=" * 55)
print("KEY BUSINESS METRICS")
print("=" * 55)

metrics = spark.sql("""
    SELECT
        COUNT(DISTINCT order_day)           AS days_of_data,
        ROUND(SUM(total_revenue), 2)        AS overall_revenue,
        ROUND(AVG(total_revenue), 2)        AS avg_daily_revenue,
        ROUND(MAX(total_revenue), 2)        AS best_day_revenue,
        SUM(total_orders)                   AS overall_orders,
        SUM(unique_customers)               AS overall_customers
    FROM ecommerce.gold.daily_revenue
""")

metrics.show()


GOLD LAYER — FINAL VERIFICATION
daily_revenue                    612 records
customer_ltv                  93,471 records
product_performance           32,216 records
category_performance              73 records
seller_performance             2,970 records
payment_analysis                   4 records

KEY BUSINESS METRICS
+------------+---------------+-----------------+----------------+--------------+-----------------+
|days_of_data|overall_revenue|avg_daily_revenue|best_day_revenue|overall_orders|overall_customers|
+------------+---------------+-----------------+----------------+--------------+-----------------+
|         612|  1.322149811E7|         21603.76|       149916.58|         96478|            96478|
+------------+---------------+-----------------+----------------+--------------+-----------------+

