In [0]:
from pyspark.sql import functions as F

# Directory the raw CSV files are read from; ingest_csv_to_bronze joins each file name onto it.
landing = "/Volumes/ecommerce/bronze/landing_zone"

def ingest_csv_to_bronze(source_file, target_table):
    """
    Load one CSV from the landing directory into a Bronze Delta table.

    The CSV is read with a header row and with schema inference disabled,
    so every source column is kept as a string. Two metadata columns are
    appended before writing:

    * ``_ingestion_time`` - the current timestamp
    * ``_source_file``    - the full path of the CSV that was read

    The target table is written in overwrite mode, then re-read to print
    the number of rows it contains.

    Args:
        source_file: CSV file name, relative to the ``landing`` directory.
        target_table: Name of the Delta table to write to.
    """
    print(f"Ingesting: {source_file} → {target_table}")

    # Build the path once; it is both the read location and the lineage value.
    source_path = f"{landing}/{source_file}"

    reader = spark.read.options(
        header="true",        # column names come from the first row
        inferSchema="false",  # no type inference: every column stays a string
        multiLine="true",     # quoted values may span several lines
        escape='"',           # a double quote is the escape character inside quoted values
    )
    raw_rows = reader.csv(source_path)

    df = (
        raw_rows
        .withColumn("_ingestion_time", F.current_timestamp())
        .withColumn("_source_file", F.lit(source_path))
    )

    # Overwrite mode: each run replaces the table contents.
    writer = df.write.format("delta").mode("overwrite").option("mergeSchema", "true")
    writer.saveAsTable(target_table)

    # Count from the saved table rather than from the in-memory frame.
    count = spark.table(target_table).count()
    print(f"Done! {count:,} records loaded into {target_table}\n")


# ── Ingest all 7 files ──────────────────────────────────────────
# Each pair is (CSV file in the landing directory, Bronze table it is loaded into).
# The list order is the load order.
bronze_loads = [
    ("olist_orders_dataset.csv",         "ecommerce.bronze.raw_orders"),
    ("olist_order_items_dataset.csv",    "ecommerce.bronze.raw_order_items"),
    ("olist_customers_dataset.csv",      "ecommerce.bronze.raw_customers"),
    ("olist_products_dataset.csv",       "ecommerce.bronze.raw_products"),
    ("olist_order_payments_dataset.csv", "ecommerce.bronze.raw_payments"),
    ("olist_order_reviews_dataset.csv",  "ecommerce.bronze.raw_reviews"),
    ("olist_sellers_dataset.csv",        "ecommerce.bronze.raw_sellers"),
]

for csv_name, bronze_table in bronze_loads:
    ingest_csv_to_bronze(csv_name, bronze_table)

Ingesting: olist_orders_dataset.csv → ecommerce.bronze.raw_orders
Done! 99,441 records loaded into ecommerce.bronze.raw_orders

Ingesting: olist_order_items_dataset.csv → ecommerce.bronze.raw_order_items
Done! 112,650 records loaded into ecommerce.bronze.raw_order_items

Ingesting: olist_customers_dataset.csv → ecommerce.bronze.raw_customers
Done! 99,441 records loaded into ecommerce.bronze.raw_customers

Ingesting: olist_products_dataset.csv → ecommerce.bronze.raw_products
Done! 32,951 records loaded into ecommerce.bronze.raw_products

Ingesting: olist_order_payments_dataset.csv → ecommerce.bronze.raw_payments
Done! 103,886 records loaded into ecommerce.bronze.raw_payments

Ingesting: olist_order_reviews_dataset.csv → ecommerce.bronze.raw_reviews
Done! 99,224 records loaded into ecommerce.bronze.raw_reviews

Ingesting: olist_sellers_dataset.csv → ecommerce.bronze.raw_sellers
Done! 3,095 records loaded into ecommerce.bronze.raw_sellers



In [0]:
# Report header: title framed by two rules.
print("=" * 55, "BRONZE LAYER — FINAL VERIFICATION", "=" * 55, sep="\n")

# Short display label -> fully qualified Bronze table name.
tables = {
    "raw_orders":      "ecommerce.bronze.raw_orders",
    "raw_order_items": "ecommerce.bronze.raw_order_items",
    "raw_customers":   "ecommerce.bronze.raw_customers",
    "raw_products":    "ecommerce.bronze.raw_products",
    "raw_payments":    "ecommerce.bronze.raw_payments",
    "raw_reviews":     "ecommerce.bronze.raw_reviews",
    "raw_sellers":     "ecommerce.bronze.raw_sellers",
}

# Print each table's row count as soon as it is known, and keep the
# counts so the grand total can be computed afterwards.
counts_seen = []
for name, table in tables.items():
    count = spark.table(table).count()
    counts_seen.append(count)
    print(f"{name:<20} {count:>10,} records")

total_records = sum(counts_seen)

# Report footer: rule followed by the grand total.
print("-" * 45, f"{'TOTAL':<20} {total_records:>10,} records", sep="\n")

BRONZE LAYER — FINAL VERIFICATION
raw_orders               99,441 records
raw_order_items         112,650 records
raw_customers            99,441 records
raw_products             32,951 records
raw_payments            103,886 records
raw_reviews              99,224 records
raw_sellers               3,095 records
---------------------------------------------
TOTAL                   550,688 records


In [0]:
# Profile of the raw Bronze data — these are the issues the Silver layer will fix.
from pyspark.sql.functions import col, sum as spark_sum, when

# How many orders are in each status, most frequent first.
print("ORDER STATUS distribution:")
(
    spark.table("ecommerce.bronze.raw_orders")
    .groupBy("order_status")
    .count()
    .orderBy("count", ascending=False)
    .show()
)

# One output row with, for every column, the number of NULL values in it.
print("NULL values in raw_orders:")
df = spark.table("ecommerce.bronze.raw_orders")
null_counts = df.select(*(
    spark_sum(when(col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in df.columns
))
null_counts.show()

# How many payments use each payment type, most frequent first.
print("PAYMENT TYPE distribution:")
(
    spark.table("ecommerce.bronze.raw_payments")
    .groupBy("payment_type")
    .count()
    .orderBy("count", ascending=False)
    .show()
)


ORDER STATUS distribution:
+------------+-----+
|order_status|count|
+------------+-----+
|   delivered|96478|
|     shipped| 1107|
|    canceled|  625|
| unavailable|  609|
|    invoiced|  314|
|  processing|  301|
|     created|    5|
|    approved|    2|
+------------+-----+

NULL values in raw_orders:
+--------+-----------+------------+------------------------+-----------------+----------------------------+-----------------------------+-----------------------------+---------------+------------+
|order_id|customer_id|order_status|order_purchase_timestamp|order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|_ingestion_time|_source_file|
+--------+-----------+------------+------------------------+-----------------+----------------------------+-----------------------------+-----------------------------+---------------+------------+
|       0|          0|           0|                       0|              160|                        

In [0]:
from pyspark.sql.functions import col, sum as spark_sum, when

df = spark.table("ecommerce.bronze.raw_orders")

# Count the NULLs of every column in ONE aggregation (a single Spark job).
# Running df.filter(...).count() per column would launch one job - and one
# full scan of the table - for each column.
null_row = df.select([
    spark_sum(when(col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in df.columns
]).first()

# List of (column name, number of NULLs), in the table's column order.
# sum() over zero rows yields NULL (None in Python), so `or 0` keeps the
# `nulls > 0` comparison below valid for an empty table.
null_counts = [(c, null_row[c] or 0) for c in df.columns]

# Display nulls in a readable vertical format
print("NULL VALUES IN raw_orders:")
print("-" * 40)
for column, nulls in null_counts:
    status = "Warning" if nulls > 0 else "Good"
    print(f"{status} {column:<35} {nulls:>6} nulls")


NULL VALUES IN raw_orders:
----------------------------------------
Good order_id                                 0 nulls
Good customer_id                              0 nulls
Good order_status                             0 nulls
Good order_purchase_timestamp                 0 nulls
Good order_estimated_delivery_date            0 nulls
Good _ingestion_time                          0 nulls
Good _source_file                             0 nulls
