Orders

In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable

# ── Bronze input ──────────────────────────────────────────────────
bronze_orders = spark.table("ecommerce.bronze.raw_orders")

# Every order date column is parsed with this single pattern.
order_ts_format = "yyyy-MM-dd HH:mm:ss"
order_ts_columns = [
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
]

# ── Keys: one row per order_id, rows without an id discarded ──────
# NOTE(review): dropDuplicates keeps an arbitrary row when an order_id
# repeats, so a rerun may keep a different survivor — confirm acceptable.
orders_keyed = (
    bronze_orders
    .dropDuplicates(["order_id"])
    .where(F.col("order_id").isNotNull())
)

# ── Status: trimmed, then upper-cased ─────────────────────────────
orders_typed = orders_keyed.withColumn(
    "order_status", F.upper(F.trim(F.col("order_status")))
)

# ── Dates: string → TIMESTAMP, one column at a time ───────────────
for ts_column in order_ts_columns:
    orders_typed = orders_typed.withColumn(
        ts_column, F.to_timestamp(ts_column, order_ts_format)
    )

# ── Delivery delay: customer delivery date minus estimated date ───
# Only computed when a customer delivery date exists; a when() with no
# otherwise() yields NULL for the remaining rows.
delivered_at = F.col("order_delivered_customer_date")
estimated_at = F.col("order_estimated_delivery_date")
delay_days = F.when(
    delivered_at.isNotNull(),
    F.datediff(delivered_at, estimated_at),
)

# ── Final shape: derived columns added, Bronze metadata removed ───
silver_orders = (
    orders_typed
    .withColumn("delivery_delay_days", delay_days)
    .withColumn("updated_at", F.current_timestamp())
    .drop("_ingestion_time", "_source_file")
)

# ── Persist to Silver (full overwrite, schema replaced as well) ───
orders_writer = (
    silver_orders.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
orders_writer.saveAsTable("ecommerce.silver.orders")

# Re-read the saved table so the printed count reflects what was written.
count = spark.table("ecommerce.silver.orders").count()
print(f"silver.orders: {count:,} records")

silver.orders: 99,441 records


Customers

In [0]:
# ── Bronze input ──────────────────────────────────────────────────
bronze_customers = spark.table("ecommerce.bronze.raw_customers")

# Column expressions, named up front so the pipeline below reads as a list
# of steps.
city_clean = F.initcap(F.trim(F.col("customer_city")))     # "sao paulo" → "Sao Paulo"
state_clean = F.upper(F.trim(F.col("customer_state")))     # "sp" → "SP"
# NOTE(review): an integer cast drops leading zeros if the Bronze value is a
# zero-padded string — confirm the prefix is safe to store as a number.
zip_as_int = F.col("customer_zip_code_prefix").cast("integer")

silver_customers = (
    bronze_customers
    # One row per customer_id; rows with no customer_id are discarded.
    .dropDuplicates(["customer_id"])
    .where(F.col("customer_id").isNotNull())
    # Standardized location fields and typed zip prefix.
    .withColumn("customer_city", city_clean)
    .withColumn("customer_state", state_clean)
    .withColumn("customer_zip_code_prefix", zip_as_int)
    # Load timestamp for this Silver build.
    .withColumn("updated_at", F.current_timestamp())
    # Bronze-only metadata does not belong in Silver.
    .drop("_ingestion_time", "_source_file")
)

# ── Persist to Silver (full overwrite, schema replaced as well) ───
customers_writer = (
    silver_customers.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
customers_writer.saveAsTable("ecommerce.silver.customers")

# Re-read the saved table so the printed count reflects what was written.
count = spark.table("ecommerce.silver.customers").count()
print(f"silver.customers: {count:,} records")

silver.customers: 99,441 records


Products

In [0]:
# ── Bronze input ──────────────────────────────────────────────────
bronze_products = spark.table("ecommerce.bronze.raw_products")

# Columns grouped by target type. The "lenght" spelling is the column name
# as it exists in the data and must not be corrected here.
product_int_columns = [
    "product_name_lenght",
    "product_description_lenght",
    "product_photos_qty",
]
product_double_columns = [
    "product_weight_g",
    "product_length_cm",
    "product_height_cm",
    "product_width_cm",
]

# ── Keys: one row per product_id, rows without an id discarded ────
products_keyed = (
    bronze_products
    .dropDuplicates(["product_id"])
    .where(F.col("product_id").isNotNull())
)

# ── Category name: underscores → spaces, trimmed, title-cased ─────
# "telefonia_fixa" → "Telefonia Fixa"
category_clean = F.initcap(
    F.trim(F.regexp_replace("product_category_name", "_", " "))
)
products_typed = products_keyed.withColumn("product_category_name", category_clean)

# ── Numeric casts ─────────────────────────────────────────────────
for int_column in product_int_columns:
    products_typed = products_typed.withColumn(
        int_column, F.col(int_column).cast("integer")
    )
for double_column in product_double_columns:
    products_typed = products_typed.withColumn(
        double_column, F.col(double_column).cast("double")
    )

# ── Volume in cm³: length × height × width (after the double casts) ──
length_cm = F.col("product_length_cm")
height_cm = F.col("product_height_cm")
width_cm = F.col("product_width_cm")

silver_products = (
    products_typed
    .withColumn("product_volume_cm3", length_cm * height_cm * width_cm)
    .withColumn("updated_at", F.current_timestamp())
    .drop("_ingestion_time", "_source_file")
)

# ── Persist to Silver (full overwrite, schema replaced as well) ───
products_writer = (
    silver_products.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
products_writer.saveAsTable("ecommerce.silver.products")

# Re-read the saved table so the printed count reflects what was written.
count = spark.table("ecommerce.silver.products").count()
print(f"silver.products: {count:,} records")

silver.products: 32,951 records


Order Items

In [0]:
# ── Bronze input ──────────────────────────────────────────────────
bronze_items = spark.table("ecommerce.bronze.raw_order_items")

# Target type for each numeric column, applied in this order.
item_casts = [
    ("order_item_id", "integer"),
    ("price", "double"),
    ("freight_value", "double"),
]

# ── Keys: one row per (order_id, order_item_id); both the order and the
#    product reference must be present ─────────────────────────────
items_keyed = (
    bronze_items
    .dropDuplicates(["order_id", "order_item_id"])
    .where(F.col("order_id").isNotNull())
    .where(F.col("product_id").isNotNull())
)

# ── Numeric casts ─────────────────────────────────────────────────
items_typed = items_keyed
for column_name, target_type in item_casts:
    items_typed = items_typed.withColumn(
        column_name, F.col(column_name).cast(target_type)
    )

# ── Line total: price plus freight, rounded to 2 decimals ─────────
line_total = F.round(F.col("price") + F.col("freight_value"), 2)

silver_order_items = (
    items_typed
    # Shipping deadline: string → TIMESTAMP.
    .withColumn(
        "shipping_limit_date",
        F.to_timestamp("shipping_limit_date", "yyyy-MM-dd HH:mm:ss"),
    )
    .withColumn("line_total", line_total)
    .withColumn("updated_at", F.current_timestamp())
    .drop("_ingestion_time", "_source_file")
)

# ── Persist to Silver (full overwrite, schema replaced as well) ───
order_items_writer = (
    silver_order_items.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
order_items_writer.saveAsTable("ecommerce.silver.order_items")

# Re-read the saved table so the printed count reflects what was written.
count = spark.table("ecommerce.silver.order_items").count()
print(f"silver.order_items: {count:,} records")

silver.order_items: 112,650 records


Payments

In [0]:
# Build ecommerce.silver.payments from ecommerce.bronze.raw_payments:
# deduplicate, standardize the payment type, drop undefined types, cast the
# numeric columns, then overwrite the Silver Delta table.
bronze_payments = spark.table("ecommerce.bronze.raw_payments")

silver_payments = (
    bronze_payments

    # ── Remove duplicates ───────────────────────────────────────
    .dropDuplicates(["order_id", "payment_sequential"])

    # ── Filter nulls ────────────────────────────────────────────
    .filter(F.col("order_id").isNotNull())

    # ── Standardize payment type ─────────────────────────────────
    # Done BEFORE the not_defined filter so that case or whitespace
    # variants (e.g. " Not_Defined ") are normalized first and cannot
    # slip through the filter and land in Silver as "NOT_DEFINED".
    .withColumn("payment_type",
        F.upper(F.trim(F.col("payment_type"))))

    # ── Filter out not_defined payment types ─────────────────────
    # Compared against the standardized value. Rows whose payment_type
    # is NULL are dropped here too (a comparison with NULL is never
    # true), which matches the previous behavior.
    .filter(F.col("payment_type") != "NOT_DEFINED")

    # ── Cast numeric columns ─────────────────────────────────────
    .withColumn("payment_sequential",
        F.col("payment_sequential").cast("integer"))
    .withColumn("payment_installments",
        F.col("payment_installments").cast("integer"))
    .withColumn("payment_value",
        F.col("payment_value").cast("double"))

    # ── Add updated_at ───────────────────────────────────────────
    .withColumn("updated_at", F.current_timestamp())

    # ── Drop Bronze metadata ─────────────────────────────────────
    .drop("_ingestion_time", "_source_file")
)

# Full overwrite of the Silver table, replacing its schema as well.
(silver_payments.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("ecommerce.silver.payments")
)

# Re-read the saved table so the printed count reflects what was written.
count = spark.table("ecommerce.silver.payments").count()
print(f"silver.payments: {count:,} records")

silver.payments: 103,883 records


Reviews

In [0]:
# Build ecommerce.silver.reviews from ecommerce.bronze.raw_reviews:
# deduplicate, type the score and dates, fill missing comment text, derive a
# sentiment label, then overwrite the Silver Delta table.
bronze_reviews = spark.table("ecommerce.bronze.raw_reviews")

# Integer review score, reused by the sentiment rules below.
review_score = F.col("review_score")

silver_reviews = (
    bronze_reviews

    # ── Remove duplicates ───────────────────────────────────────
    # NOTE(review): keyed on review_id alone. If one review_id can be
    # attached to several order_ids, only one of those rows survives —
    # confirm the intended key.
    .dropDuplicates(["review_id"])

    # ── Filter nulls ────────────────────────────────────────────
    .filter(F.col("review_id").isNotNull())
    .filter(F.col("order_id").isNotNull())

    # ── Cast review score to integer ─────────────────────────────
    .withColumn("review_score",
        F.col("review_score").cast("integer"))

    # ── Cast dates ───────────────────────────────────────────────
    .withColumn("review_creation_date",
        F.to_timestamp("review_creation_date", "yyyy-MM-dd HH:mm:ss"))
    .withColumn("review_answer_timestamp",
        F.to_timestamp("review_answer_timestamp", "yyyy-MM-dd HH:mm:ss"))

    # ── Clean comment text ───────────────────────────────────────
    # Missing text gets a placeholder; present text is trimmed.
    .withColumn("review_comment_title",
        F.when(F.col("review_comment_title").isNull(), "No Title")
         .otherwise(F.trim(F.col("review_comment_title"))))
    .withColumn("review_comment_message",
        F.when(F.col("review_comment_message").isNull(), "No Comment")
         .otherwise(F.trim(F.col("review_comment_message"))))

    # ── Add sentiment label based on score ───────────────────────
    # A NULL score (missing, or a value the integer cast could not
    # parse) fails both comparisons below, so without the first branch
    # it would fall into otherwise() and be labelled "NEGATIVE".
    # Unknown scores now get a NULL sentiment instead. Every non-null
    # score is labelled exactly as before.
    .withColumn("sentiment",
        F.when(review_score.isNull(), F.lit(None).cast("string"))
         .when(review_score >= 4, "POSITIVE")
         .when(review_score == 3, "NEUTRAL")
         .otherwise("NEGATIVE")
    )

    # ── Add updated_at ───────────────────────────────────────────
    .withColumn("updated_at", F.current_timestamp())

    # ── Drop Bronze metadata ─────────────────────────────────────
    .drop("_ingestion_time", "_source_file")
)

# Full overwrite of the Silver table, replacing its schema as well.
(silver_reviews.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("ecommerce.silver.reviews")
)

# Re-read the saved table so the printed count reflects what was written.
count = spark.table("ecommerce.silver.reviews").count()
print(f"silver.reviews: {count:,} records")

silver.reviews: 98,410 records


Sellers

In [0]:
# ── Bronze input ──────────────────────────────────────────────────
bronze_sellers = spark.table("ecommerce.bronze.raw_sellers")

# Column expressions, named up front so the pipeline below reads as a list
# of steps.
seller_city_clean = F.initcap(F.trim(F.col("seller_city")))
seller_state_clean = F.upper(F.trim(F.col("seller_state")))
# NOTE(review): an integer cast drops leading zeros if the Bronze value is a
# zero-padded string — confirm the prefix is safe to store as a number.
seller_zip_as_int = F.col("seller_zip_code_prefix").cast("integer")

silver_sellers = (
    bronze_sellers
    # One row per seller_id; rows with no seller_id are discarded.
    .dropDuplicates(["seller_id"])
    .where(F.col("seller_id").isNotNull())
    # Standardized location fields and typed zip prefix.
    .withColumn("seller_city", seller_city_clean)
    .withColumn("seller_state", seller_state_clean)
    .withColumn("seller_zip_code_prefix", seller_zip_as_int)
    # Load timestamp for this Silver build.
    .withColumn("updated_at", F.current_timestamp())
    # Bronze-only metadata does not belong in Silver.
    .drop("_ingestion_time", "_source_file")
)

# ── Persist to Silver (full overwrite, schema replaced as well) ───
sellers_writer = (
    silver_sellers.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
sellers_writer.saveAsTable("ecommerce.silver.sellers")

# Re-read the saved table so the printed count reflects what was written.
count = spark.table("ecommerce.silver.sellers").count()
print(f"silver.sellers: {count:,} records")

silver.sellers: 3,095 records


In [0]:
# Final check of the Silver layer: per-table row counts with a grand total,
# a small sample of orders, and the review sentiment breakdown.
banner = "=" * 55
print(banner)
print("SILVER LAYER — FINAL VERIFICATION")
print(banner)

# Display label → fully qualified Silver table name.
tables = {
    "orders":      "ecommerce.silver.orders",
    "customers":   "ecommerce.silver.customers",
    "products":    "ecommerce.silver.products",
    "order_items": "ecommerce.silver.order_items",
    "payments":    "ecommerce.silver.payments",
    "reviews":     "ecommerce.silver.reviews",
    "sellers":     "ecommerce.silver.sellers",
}

# Count each table, printing its line as soon as the count is known.
counts = {}
for name, table in tables.items():
    count = spark.table(table).count()
    counts[name] = count
    print(f"{name:<20} {count:>10,} records")
total = sum(counts.values())

print("-" * 45)
print(f"{'TOTAL':<20} {total:>10,} records")

# ── Show sample silver orders ────────────────────────────────────
print("\nSample silver.orders:")
sample_columns = [
    "order_id",
    "order_status",
    "order_purchase_timestamp",
    "delivery_delay_days",
]
spark.table("ecommerce.silver.orders").select(*sample_columns).show(5)

# ── Show sentiment distribution, most frequent label first ───────
print("Review sentiment distribution:")
(
    spark.table("ecommerce.silver.reviews")
    .groupBy("sentiment")
    .count()
    .orderBy(F.col("count").desc())
    .show()
)


SILVER LAYER — FINAL VERIFICATION
orders                   99,441 records
customers                99,441 records
products                 32,951 records
order_items             112,650 records
payments                103,883 records
reviews                  98,410 records
sellers                   3,095 records
---------------------------------------------
TOTAL                   549,871 records

Sample silver.orders:
+--------------------+------------+------------------------+-------------------+
|            order_id|order_status|order_purchase_timestamp|delivery_delay_days|
+--------------------+------------+------------------------+-------------------+
|f373335aac9a659de...|     SHIPPED|     2018-03-17 15:32:31|               NULL|
|118045506e1c1dda0...|   DELIVERED|     2018-03-08 19:06:05|                  7|
|cc66dee6fbc18bb79...|   DELIVERED|     2018-04-12 14:37:29|                -17|
|f44cb69655f8e4d13...|   DELIVERED|     2018-07-13 22:22:57|                 -6|
|edcc6b79e