In [1]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("SilverLayerETL")
    .master("local[*]")
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.1.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

In [2]:
bronze_base = r"C:\Users\User\Desktop\E-Commerce Data Lakaehouse with AI-Powered Self-Healing Pipelines\bronze_layer\bronze_data"
silver_base = r"C:\Users\User\Desktop\E-Commerce Data Lakaehouse with AI-Powered Self-Healing Pipelines\silver_layer\silver_data"

datasets = ["customers", "products", "orders", "deliveries"]

In [3]:
base_path = r"C:\Users\User\Desktop\E-Commerce Data Lakaehouse with AI-Powered Self-Healing Pipelines\bronze_layer"

spark.read.format("delta").load(f"{base_path}\\bronze_data\\customers\\data").createOrReplaceTempView("customers")
spark.read.format("delta").load(f"{base_path}\\bronze_data\\orders\\data").createOrReplaceTempView("orders")
spark.read.format("delta").load(f"{base_path}\\bronze_data\\products\\data").createOrReplaceTempView("products")
spark.read.format("delta").load(f"{base_path}\\bronze_data\\deliveries\\data").createOrReplaceTempView("deliveries")

In [4]:
bronze_customers = spark.read.format("delta").load(f"{bronze_base}\\customers\\data")
bronze_products = spark.read.format("delta").load(f"{bronze_base}\\products\\data")
bronze_orders = spark.read.format("delta").load(f"{bronze_base}\\orders\\data")
bronze_deliveries = spark.read.format("delta").load(f"{bronze_base}\\deliveries\\data")

Transformation of Customer table

In [5]:
spark.sql("SELECT * FROM customers LIMIT 10").show()

+-------------+----------+---------+--------------------+----------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+-----------------+
|  customer_id|first_name|last_name|               email|     phone|             address|                city|             country|signup_date|       last_updated| ingestion_timestamp|             name|
+-------------+----------+---------+--------------------+----------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+-----------------+
|CUST-1C7B4FE4|   William|     Ross|william.ross78@ya...|3730730184|52224 May Valleys...|       South Matthew|    Christmas Island| 2025-06-02|2025-09-08 22:53:37|2025-09-26 18:35:...|     William Ross|
|CUST-2E039EDB|    Jeremy|     Byrd|jeremy.byrd22@gma...|1807627048|   459 Edwards Parks|North Christopher...|               Chile| 2020-12-25|2025-09-08 22:53:37|2025-09-26 18:35:...|    

In [6]:
bronze_customers.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- signup_date: date (nullable = true)
 |-- last_updated: timestamp (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)
 |-- name: string (nullable = true)



In [7]:
from pyspark.sql.functions import *

silver_customers = bronze_customers.withColumn("last_updated_ts", to_timestamp(col("last_updated")))\
                                   .dropDuplicates(["customer_id"])\
                                   .withColumn("full_name", concat_ws(" ", col("first_name"), col("last_name")))\
                                   .withColumn("signup_date", to_timestamp(col("signup_date")))\
                                   .withColumn("ingestion_timestamp", current_timestamp())\
                                   .select("customer_id", "first_name", "last_name", "full_name", "email", "phone", 
                                           "address", "city", "country", "signup_date", "last_updated", "ingestion_timestamp")


In [8]:
silver_customers.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- full_name: string (nullable = false)
 |-- email: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- signup_date: timestamp (nullable = true)
 |-- last_updated: timestamp (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = false)



Transformation of Products Table

In [9]:
spark.sql("SELECT * FROM products LIMIT 10").show()

+------------+--------------------+---------------+-------+--------------+----------+-------------------+--------------------+
|  product_id|                name|       category|  price|stock_quantity|created_at|       last_updated| ingestion_timestamp|
+------------+--------------------+---------------+-------+--------------+----------+-------------------+--------------------+
|PRD-C5F26AC0|     Toys - True 571|           Toys|1806.07|           670|2021-10-01|2025-09-08 22:27:07|2025-09-26 16:52:...|
|PRD-AA6575B5|   Health - Goal 523|         Health| 121.04|           572|2021-05-10|2025-09-08 22:27:07|2025-09-26 16:52:...|
|PRD-369D4F6A|Clothing - Debate...|       Clothing| 751.87|            68|2024-09-30|2025-09-08 22:27:07|2025-09-26 16:52:...|
|PRD-E2377939| Groceries - For 286|      Groceries|1024.81|           602|2021-06-05|2025-09-08 22:27:07|2025-09-26 16:52:...|
|PRD-2AC55643|    Books - Meet 351|          Books|1094.27|           378|2025-04-11|2025-09-08 22:27:07|2025-0

In [10]:
bronze_products.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: double (nullable = true)
 |-- stock_quantity: integer (nullable = true)
 |-- created_at: date (nullable = true)
 |-- last_updated: timestamp (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)



In [11]:
silver_products = (
    bronze_products
    .withColumn("last_updated_ts", to_timestamp(col("last_updated")))
    .dropDuplicates(["product_id"])
    .withColumn("price", col("price").cast("double"))
    .withColumn("stock_quantity", col("stock_quantity").cast("int"))
    .withColumn("ingestion_timestamp", current_timestamp())
    .select("product_id", "name", "category", "price", "stock_quantity",
            "created_at", "last_updated", "ingestion_timestamp")
)

In [12]:
silver_products.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: double (nullable = true)
 |-- stock_quantity: integer (nullable = true)
 |-- created_at: date (nullable = true)
 |-- last_updated: timestamp (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = false)



Transformation of Orders table

In [13]:
spark.sql("SELECT * FROM orders LIMIT 10").show()

+------------+----------------+-------------+------------+--------+------------+----------+--------------+------------+-------------------+--------------------+
|    order_id|  transaction_id|  customer_id|  product_id|quantity|total_amount|order_date|payment_method|order_status|       last_updated| ingestion_timestamp|
+------------+----------------+-------------+------------+--------+------------+----------+--------------+------------+-------------------+--------------------+
|ORD-E1C38356|TXN-A0948EFF52B7|CUST-B0B237C6|PRD-DDF1D934|       2|      215.42|2021-09-27|          Cash|     Pending|2025-09-08 22:30:58|2025-09-26 16:52:...|
|ORD-EB9DA7BA|TXN-36F837351544|CUST-C63BEB53|PRD-7CD795ED|       5|     6867.05|2022-12-16|   Credit Card|     Pending|2025-09-08 22:30:58|2025-09-26 16:52:...|
|ORD-5374CFE5|TXN-412B81C8A807|CUST-883A190F|PRD-816E4975|       2|      2133.0|2025-08-15|           UPI|   Cancelled|2025-09-08 22:30:58|2025-09-26 16:52:...|
|ORD-344C0DA9|TXN-4ADE6F3CDA8C|CUS

In [14]:
bronze_orders.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- order_date: date (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- last_updated: timestamp (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)



In [15]:
silver_orders = (
    bronze_orders
    .withColumn("order_date", to_timestamp(col("order_date")))
    .withColumn("total_amount", col("total_amount").cast("double"))
    .dropDuplicates(["order_id"])
    .withColumn("ingestion_timestamp", current_timestamp())
    .select("order_id", "transaction_id", "customer_id", "product_id", 
            "quantity", "total_amount", "order_date", "payment_method", 
            "order_status", "last_updated", "ingestion_timestamp")
)

In [16]:
silver_orders.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- last_updated: timestamp (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = false)



Transformation of Deliveries table

In [17]:
spark.sql("SELECT * FROM deliveries LIMIT 10").show()

+--------------------+--------------------+--------------------+-------------+-----------------+------------+--------------------+------------+--------------+--------------------+-----------------+--------------------+-------------+---------------+-------------------+--------------------+
|         delivery_id|            order_id|      transaction_id|  customer_id|    customer_name|  product_id|        product_name|total_amount|payment_method|    delivery_address|    delivery_city|    delivery_country|delivery_date|delivery_status|       last_updated| ingestion_timestamp|
+--------------------+--------------------+--------------------+-------------+-----------------+------------+--------------------+------------+--------------+--------------------+-----------------+--------------------+-------------+---------------+-------------------+--------------------+
|a296558b-0cbc-45f...|01dc2825-75d3-479...|8007443d-c10a-404...|CUST-950A2904| Tracey Carpenter|PRD-F50E9D5A| Beauty - Either 214|

In [18]:
bronze_orders.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- order_date: date (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- last_updated: timestamp (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)



In [19]:
silver_deliveries = (
    bronze_deliveries
    .withColumn("delivery_date", to_timestamp(col("delivery_date")))
    .withColumn("total_amount", col("total_amount").cast("double"))
    .dropDuplicates(["delivery_id"])
    .withColumn("ingestion_timestamp", current_timestamp())
    .select("delivery_id", "order_id", "transaction_id", "customer_id", 
            "customer_name", "product_id", "product_name", "total_amount", 
            "payment_method", "delivery_address", "delivery_city", "delivery_country",
            "delivery_date", "delivery_status", "last_updated", "ingestion_timestamp")
)

In [20]:
silver_deliveries.printSchema()

root
 |-- delivery_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- delivery_address: string (nullable = true)
 |-- delivery_city: string (nullable = true)
 |-- delivery_country: string (nullable = true)
 |-- delivery_date: timestamp (nullable = true)
 |-- delivery_status: string (nullable = true)
 |-- last_updated: timestamp (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = false)



Adding Surrogate Keys 

Surrogate key for Customer table

In [21]:
silver_customers = silver_customers.withColumn("customer_sk", monotonically_increasing_id())  


In [22]:
silver_customers.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- full_name: string (nullable = false)
 |-- email: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- signup_date: timestamp (nullable = true)
 |-- last_updated: timestamp (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = false)
 |-- customer_sk: long (nullable = false)



Surrogate Key for products table

In [23]:
silver_products = silver_products.withColumn("product_sk",monotonically_increasing_id())

In [24]:
silver_products.printSchema()


root
 |-- product_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: double (nullable = true)
 |-- stock_quantity: integer (nullable = true)
 |-- created_at: date (nullable = true)
 |-- last_updated: timestamp (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = false)
 |-- product_sk: long (nullable = false)



Dumping into Silver Delta Table

In [25]:
# Silver Customers
silver_customers.write.format("delta") \
    .mode("append") \
    .save(f"{silver_base}\\customers\\data")

# Silver Products
silver_products.write.format("delta") \
    .mode("append") \
    .save(f"{silver_base}\\products\\data")

# Silver Orders
silver_orders.write.format("delta") \
    .mode("append") \
    .save(f"{silver_base}\\orders\\data")

# Silver Deliveries
silver_deliveries.write.format("delta") \
    .mode("append") \
    .save(f"{silver_base}\\deliveries\\data")
