# In [0]:
# BRONZE LAYER - Raw Data Ingestion
# Purpose: Load raw CSVs from Unity Catalog Volume into Delta tables

from pyspark.sql.functions import current_timestamp
from pyspark.sql.types import StringType

# Opening banner: a 70-character rule above and below the stage title.
print(
    "=" * 70,
    "BRONZE LAYER - Loading Raw Data from Volume into Delta Tables",
    "=" * 70,
    sep="\n",
)

# Directory (Unity Catalog Volume) that holds the raw CSV files.
# The trailing slash matters: file names are appended directly to this string.
base_path = "/Volumes/workspace/default/olist_data/"

# Map of bronze table suffix -> CSV file name (EXACT names from Olist dataset).
# Most files follow the olist_<table>_dataset.csv naming pattern.
csv_files = {
    table: f"olist_{table}_dataset.csv"
    for table in (
        "customers",
        "orders",
        "order_items",
        "order_payments",
        "order_reviews",
        "products",
        "sellers",
        "geolocation",
    )
}
# The category translation file is the one exception to that pattern.
csv_files["product_category_translation"] = "product_category_name_translation.csv"

# Running totals updated by the ingestion loop.
success_count = error_count = total_rows = 0

# Create Bronze Delta Tables
# For every entry in csv_files: read the CSV with all columns as strings, add an
# ingestion timestamp, and overwrite workspace.default.bronze_<table_name>.
# A failure on one table is reported and counted, and the loop moves on to the
# next table (best-effort load).
for table_name, file_name in csv_files.items():
    source_path = f"{base_path}{file_name}"
    target_table = f"workspace.default.bronze_{table_name}"

    print(f"\n{'='*70}")
    print(f"📥 Processing: {table_name}")
    print(f"{'='*70}")

    try:
        # Read CSV - DON'T infer schema, read everything as strings first.
        # multiLine + quote/escape make the parser keep a quoted field that
        # contains line breaks or doubled quotes ("") as ONE value instead of
        # splitting it into extra, corrupted rows.
        # NOTE(review): presumably needed for free-text columns such as review
        # comments - confirm row counts against the source files.
        df = (
            spark.read
            .option("header", "true")
            .option("inferSchema", "false")
            .option("encoding", "UTF-8")
            .option("multiLine", "true")
            .option("quote", '"')
            .option("escape", '"')
            .csv(source_path)
        )

        row_count = df.count()
        col_count = len(df.columns)

        print(f"   📊 Read: {row_count:,} rows × {col_count} columns")

        # Add metadata column (tracks when data was loaded)
        df = df.withColumn("ingestion_timestamp", current_timestamp())

        # Write to Delta table (Bronze layer), replacing any previous load.
        # overwriteSchema lets the table schema change between runs.
        (
            df.write
            .format("delta")
            .mode("overwrite")
            .option("overwriteSchema", "true")
            .saveAsTable(target_table)
        )

        print(f"   ✅ Created: {target_table}")

        # Only count the table once the write has succeeded.
        success_count += 1
        total_rows += row_count

    except Exception as e:
        # Deliberate best-effort: report the failure (message truncated to 200
        # characters) and continue with the remaining tables.
        print(f"   ❌ Error loading {table_name}:")
        print(f"      {str(e)[:200]}")
        error_count += 1

# Summary
# Report how many tables were created and how many rows were read. The headline
# depends on error_count so a partial load is not announced as a clean completion.
print("\n" + "="*70)
if error_count == 0:
    print("🎉 BRONZE LAYER COMPLETE!")
else:
    print("⚠️ BRONZE LAYER FINISHED WITH ERRORS")
print("="*70)
print(f"✅ Success: {success_count}/{len(csv_files)} tables created")
print(f"📊 Total rows loaded: {total_rows:,}")
if error_count > 0:
    print(f"❌ Errors: {error_count} tables failed")
print("="*70)

# Verify: List all Bronze tables
# SHOW TABLES ... LIKE uses '*' as its wildcard, so the underscore in 'bronze_*'
# is matched literally (in a SQL LIKE filter '_' would match any single character).
print("\n📋 Bronze Tables Created:\n")
bronze_tables = spark.sql("SHOW TABLES IN workspace.default LIKE 'bronze_*'")

bronze_tables.select("tableName").show(truncate=False)

# Show sample from customers table - only if it exists, so a failed customers
# load produces a clear message here instead of an exception.
bronze_table_names = {row["tableName"] for row in bronze_tables.select("tableName").collect()}
if "bronze_customers" in bronze_table_names:
    print("\n👀 Sample from bronze_customers:\n")
    spark.table("workspace.default.bronze_customers") \
        .select("customer_id", "customer_unique_id", "customer_city", "customer_state") \
        .show(5, truncate=False)
else:
    print("\n⚠️ bronze_customers not found - skipping sample")

# Only announce readiness when every table loaded.
if error_count == 0:
    print("\n✅ Ready for Silver Layer transformation!")
else:
    print(f"\n❌ Not ready for Silver Layer: {error_count} table(s) failed to load")
print("\n💡 NOTE: All data loaded as STRING type - we'll cast to proper types in Silver layer")