In [0]:
# Make `ecommerce` the current catalog for this Spark session.
# USE CATALOG selects the catalog only: a schema (bronze / silver / gold)
# still has to be named, so this saves typing the `ecommerce.` prefix,
# not `ecommerce.bronze`.
USE_CATALOG_STMT = "USE CATALOG ecommerce"
spark.sql(USE_CATALOG_STMT)
print("Using ecommerce catalog")

Using ecommerce catalog


Bronze Layer Tables (Raw Schema)

In [0]:
# Bronze landing table for raw orders.
# All source columns (including the date/timestamp fields) are declared as
# STRING; only `_ingestion_time` is typed. `_ingestion_time` / `_source_file`
# are presumably load-audit columns filled by the ingestion job -- confirm.
# IF NOT EXISTS makes the cell re-runnable; note the message below prints
# whether or not the table was actually created by this run.
# Assumes the `ecommerce.bronze` schema already exists (not created here).
RAW_ORDERS_DDL = """
    CREATE TABLE IF NOT EXISTS ecommerce.bronze.raw_orders (
        order_id                      STRING,
        customer_id                   STRING,
        order_status                  STRING,
        order_purchase_timestamp      STRING,
        order_approved_at             STRING,
        order_delivered_carrier_date  STRING,
        order_delivered_customer_date STRING,
        order_estimated_delivery_date STRING,
        _ingestion_time               TIMESTAMP,
        _source_file                  STRING
    )
    USING DELTA
    COMMENT 'Raw orders from Olist dataset'
"""
spark.sql(RAW_ORDERS_DDL)
print("✅ raw_orders created")

✅ raw_orders created


In [0]:
# Bronze landing table for raw order line items.
# Numeric-looking fields (`price`, `freight_value`) and `shipping_limit_date`
# are declared as STRING here; no typing is applied in this DDL.
# IF NOT EXISTS keeps the cell idempotent on re-run.
RAW_ORDER_ITEMS_DDL = """
    CREATE TABLE IF NOT EXISTS ecommerce.bronze.raw_order_items (
        order_id             STRING,
        order_item_id        STRING,
        product_id           STRING,
        seller_id            STRING,
        shipping_limit_date  STRING,
        price                STRING,
        freight_value        STRING,
        _ingestion_time      TIMESTAMP,
        _source_file         STRING
    )
    USING DELTA
    COMMENT 'Raw order items from Olist dataset'
"""
spark.sql(RAW_ORDER_ITEMS_DDL)
print("raw_order_items created")

raw_order_items created


In [0]:
# Bronze landing table for raw customer records.
# Carries both `customer_id` and `customer_unique_id`; how the two relate is
# not shown in this notebook -- check the source data before joining on either.
# IF NOT EXISTS keeps the cell idempotent on re-run.
RAW_CUSTOMERS_DDL = """
    CREATE TABLE IF NOT EXISTS ecommerce.bronze.raw_customers (
        customer_id              STRING,
        customer_unique_id       STRING,
        customer_zip_code_prefix STRING,
        customer_city            STRING,
        customer_state           STRING,
        _ingestion_time          TIMESTAMP,
        _source_file             STRING
    )
    USING DELTA
    COMMENT 'Raw customers from Olist dataset'
"""
spark.sql(RAW_CUSTOMERS_DDL)
print("raw_customers created")


raw_customers created


In [0]:
# Bronze landing table for raw product attributes.
# NOTE(review): `product_name_lenght` and `product_description_lenght` are
# spelled "lenght" in this DDL. Presumably this mirrors the column headers of
# the source files -- confirm before "correcting" it, since a rename here
# would have to be matched by whatever loads the table.
# All measurements/counts are declared as STRING at this layer.
RAW_PRODUCTS_DDL = """
    CREATE TABLE IF NOT EXISTS ecommerce.bronze.raw_products (
        product_id                 STRING,
        product_category_name      STRING,
        product_name_lenght        STRING,
        product_description_lenght STRING,
        product_photos_qty         STRING,
        product_weight_g           STRING,
        product_length_cm          STRING,
        product_height_cm          STRING,
        product_width_cm           STRING,
        _ingestion_time            TIMESTAMP,
        _source_file               STRING
    )
    USING DELTA
    COMMENT 'Raw products from Olist dataset'
"""
spark.sql(RAW_PRODUCTS_DDL)
print("✅ raw_products created")

✅ raw_products created


In [0]:
# Bronze landing table for raw payment records.
# `payment_sequential`, `payment_installments` and `payment_value` are kept
# as STRING in this DDL, like every other source column in the bronze tables.
# IF NOT EXISTS keeps the cell idempotent on re-run.
RAW_PAYMENTS_DDL = """
    CREATE TABLE IF NOT EXISTS ecommerce.bronze.raw_payments (
        order_id              STRING,
        payment_sequential    STRING,
        payment_type          STRING,
        payment_installments  STRING,
        payment_value         STRING,
        _ingestion_time       TIMESTAMP,
        _source_file          STRING
    )
    USING DELTA
    COMMENT 'Raw payments from Olist dataset'
"""
spark.sql(RAW_PAYMENTS_DDL)
print("raw_payments created")

raw_payments created


In [0]:
# Bronze landing table for raw customer reviews.
# Free-text fields (`review_comment_title`, `review_comment_message`) and the
# score/date fields are all declared as STRING; nothing is parsed here.
# IF NOT EXISTS keeps the cell idempotent on re-run.
RAW_REVIEWS_DDL = """
    CREATE TABLE IF NOT EXISTS ecommerce.bronze.raw_reviews (
        review_id               STRING,
        order_id                STRING,
        review_score            STRING,
        review_comment_title    STRING,
        review_comment_message  STRING,
        review_creation_date    STRING,
        review_answer_timestamp STRING,
        _ingestion_time         TIMESTAMP,
        _source_file            STRING
    )
    USING DELTA
    COMMENT 'Raw reviews from Olist dataset'
"""
spark.sql(RAW_REVIEWS_DDL)
print("raw_reviews created")

raw_reviews created


In [0]:
# Bronze landing table for raw seller records.
# Same shape as the other bronze tables: STRING source columns plus the
# `_ingestion_time` / `_source_file` pair.
# IF NOT EXISTS keeps the cell idempotent on re-run.
RAW_SELLERS_DDL = """
    CREATE TABLE IF NOT EXISTS ecommerce.bronze.raw_sellers (
        seller_id              STRING,
        seller_zip_code_prefix STRING,
        seller_city            STRING,
        seller_state           STRING,
        _ingestion_time        TIMESTAMP,
        _source_file           STRING
    )
    USING DELTA
    COMMENT 'Raw sellers from Olist dataset'
"""
spark.sql(RAW_SELLERS_DDL)
print("raw_sellers created")

raw_sellers created


Silver Layer Tables (Cleaned Schema)

In [0]:
# Silver table for cleaned orders.
# `order_id` and `customer_id` are NOT NULL; the remaining columns are typed
# (TIMESTAMP / DOUBLE / STRING) but nullable.
# The table property switches on Delta Change Data Feed for this table.
# NOTE(review): `order_total` is DOUBLE; for currency amounts DECIMAL avoids
# binary floating-point rounding -- confirm with downstream consumers before
# changing, since IF NOT EXISTS will not alter an already-created table.
# Assumes the `ecommerce.silver` schema already exists (not created here).
SILVER_ORDERS_DDL = """
    CREATE TABLE IF NOT EXISTS ecommerce.silver.orders (
        order_id         STRING        NOT NULL,
        customer_id      STRING        NOT NULL,
        order_date       TIMESTAMP,
        status           STRING,
        order_total      DOUBLE,
        payment_method   STRING,
        shipping_address STRING,
        updated_at       TIMESTAMP
    )
    USING DELTA
    COMMENT 'Cleaned and validated orders'
    TBLPROPERTIES (
        'delta.enableChangeDataFeed' = 'true'
    )
"""
spark.sql(SILVER_ORDERS_DDL)
print("silver.orders created")

silver.orders created


In [0]:
# Silver table for cleaned order line items.
# The three identifier columns (`order_item_id`, `order_id`, `product_id`)
# are NOT NULL; quantity and the money columns are nullable.
# The table property switches on Delta Change Data Feed for this table.
# NOTE(review): `unit_price`, `discount` and `line_total` are DOUBLE; DECIMAL
# is the usual choice for currency -- confirm before changing.
SILVER_ORDER_ITEMS_DDL = """
    CREATE TABLE IF NOT EXISTS ecommerce.silver.order_items (
        order_item_id  STRING         NOT NULL,
        order_id       STRING         NOT NULL,
        product_id     STRING         NOT NULL,
        quantity       INTEGER,
        unit_price     DOUBLE,
        discount       DOUBLE,
        line_total     DOUBLE,
        updated_at     TIMESTAMP
    )
    USING DELTA
    COMMENT 'Cleaned order line items'
    TBLPROPERTIES (
        'delta.enableChangeDataFeed' = 'true'
    )
"""
spark.sql(SILVER_ORDER_ITEMS_DDL)
print("silver.order_items created")


silver.order_items created


In [0]:
# Silver table for cleaned customer profiles.
# Only `customer_id` is NOT NULL. The table holds direct identifiers
# (`first_name`, `last_name`, `email`, `phone`), so access to it should be
# treated accordingly.
# The table property switches on Delta Change Data Feed for this table.
# NOTE(review): several columns here (names, email, phone, country,
# signup_date, segment) have no counterpart in `bronze.raw_customers` as
# declared in this notebook -- confirm where they are sourced from.
SILVER_CUSTOMERS_DDL = """
    CREATE TABLE IF NOT EXISTS ecommerce.silver.customers (
        customer_id    STRING         NOT NULL,
        first_name     STRING,
        last_name      STRING,
        email          STRING,
        phone          STRING,
        city           STRING,
        country        STRING,
        signup_date    DATE,
        segment        STRING,
        updated_at     TIMESTAMP
    )
    USING DELTA
    COMMENT 'Cleaned customer profiles'
    TBLPROPERTIES (
        'delta.enableChangeDataFeed' = 'true'
    )
"""
spark.sql(SILVER_CUSTOMERS_DDL)
print("silver.customers created")

silver.customers created


In [0]:
# Silver table for the cleaned product catalog.
# Only `product_id` is NOT NULL; pricing columns and `stock_quantity` are
# nullable.
# The table property switches on Delta Change Data Feed for this table.
# NOTE(review): `price`, `cost` and `margin` are DOUBLE; DECIMAL is the usual
# choice for currency -- confirm before changing. Whether `margin` is an
# absolute amount or a ratio is not stated in this DDL.
SILVER_PRODUCTS_DDL = """
    CREATE TABLE IF NOT EXISTS ecommerce.silver.products (
        product_id     STRING         NOT NULL,
        product_name   STRING,
        category       STRING,
        sub_category   STRING,
        brand          STRING,
        price          DOUBLE,
        cost           DOUBLE,
        margin         DOUBLE,
        stock_quantity INTEGER,
        updated_at     TIMESTAMP
    )
    USING DELTA
    COMMENT 'Cleaned product catalog'
    TBLPROPERTIES (
        'delta.enableChangeDataFeed' = 'true'
    )
"""
spark.sql(SILVER_PRODUCTS_DDL)
print("silver.products created")

silver.products created


In [0]:
# Silver table for cleaned clickstream events.
# Only `event_id` is NOT NULL; `customer_id` and `product_id` are nullable.
# NOTE(review): unlike the other silver tables declared in this notebook,
# this one sets no 'delta.enableChangeDataFeed' property and has no
# `updated_at` column. That may be deliberate for event data -- confirm.
SILVER_CLICKSTREAM_DDL = """
    CREATE TABLE IF NOT EXISTS ecommerce.silver.clickstream (
        event_id        STRING        NOT NULL,
        session_id      STRING,
        customer_id     STRING,
        event_type      STRING,
        page_url        STRING,
        product_id      STRING,
        event_timestamp TIMESTAMP,
        device_type     STRING
    )
    USING DELTA
    COMMENT 'Cleaned clickstream events'
"""
spark.sql(SILVER_CLICKSTREAM_DDL)
print("silver.clickstream created")

silver.clickstream created


Gold Layer Tables (Aggregated Schema)

In [0]:
# Gold table: revenue metrics per `order_day` and `product_category`
# (per the table COMMENT). Counts are LONG, amounts are DOUBLE.
# No column is declared NOT NULL and no key/uniqueness is enforced by this
# DDL, so one-row-per-(day, category) must be guaranteed by the writer.
# Assumes the `ecommerce.gold` schema already exists (not created here).
GOLD_DAILY_REVENUE_DDL = """
    CREATE TABLE IF NOT EXISTS ecommerce.gold.daily_revenue (
        order_day          DATE,
        product_category   STRING,
        total_orders       LONG,
        total_revenue      DOUBLE,
        avg_order_value    DOUBLE,
        unique_customers   LONG,
        updated_at         TIMESTAMP
    )
    USING DELTA
    COMMENT 'Daily revenue aggregated by product category'
"""
spark.sql(GOLD_DAILY_REVENUE_DDL)
print("gold.daily_revenue created")

gold.daily_revenue created


In [0]:
# Gold table: customer lifetime value metrics, one set of columns per
# `customer_id` (uniqueness is not enforced by this DDL).
# No column is declared NOT NULL.
# NOTE(review): `first_name`, `last_name` and `email` are carried into this
# gold table; confirm that exposing these identifiers at the reporting layer
# is intended.
GOLD_CUSTOMER_LTV_DDL = """
    CREATE TABLE IF NOT EXISTS ecommerce.gold.customer_ltv (
        customer_id        STRING,
        first_name         STRING,
        last_name          STRING,
        email              STRING,
        segment            STRING,
        total_orders       LONG,
        lifetime_value     DOUBLE,
        avg_order_value    DOUBLE,
        first_order_date   DATE,
        last_order_date    DATE,
        customer_age_days  INTEGER,
        updated_at         TIMESTAMP
    )
    USING DELTA
    COMMENT 'Customer lifetime value metrics'
"""
spark.sql(GOLD_CUSTOMER_LTV_DDL)
print("gold.customer_ltv created")

gold.customer_ltv created


In [0]:
# Gold table: product sales performance metrics keyed (by convention only --
# nothing is enforced here) on `product_id`.
# No column is declared NOT NULL. The units of `profit_margin_pct` and
# `return_rate` (0-1 vs 0-100) are not stated in this DDL -- confirm with
# the job that writes the table.
GOLD_PRODUCT_PERFORMANCE_DDL = """
    CREATE TABLE IF NOT EXISTS ecommerce.gold.product_performance (
        product_id         STRING,
        product_name       STRING,
        category           STRING,
        brand              STRING,
        total_units_sold   LONG,
        total_revenue      DOUBLE,
        total_profit       DOUBLE,
        profit_margin_pct  DOUBLE,
        return_rate        DOUBLE,
        updated_at         TIMESTAMP
    )
    USING DELTA
    COMMENT 'Product sales performance metrics'
"""
spark.sql(GOLD_PRODUCT_PERFORMANCE_DDL)
print("gold.product_performance created")

gold.product_performance created
