## Bronze to Silver: Data Cleaning and Transformation for Dimension Tables

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, IntegerType, DateType, TimestampType, FloatType

# Catalog name used to qualify every bronze/silver table reference in this notebook
catalog_name = 'ecommerce'

### Brands

In [0]:
# Load the bronze brands table and preview the first rows
df_bronze = spark.read.table(f"{catalog_name}.bronze.brz_brands")
df_bronze.show(n=10)

+----------+-----------+-------------+--------------------+--------------------+
|brand_code| brand_name|category_code|        _source_file|         ingested_at|
+----------+-----------+-------------+--------------------+--------------------+
|      ACME|   AcmeTech|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      NOVW|  NovaWave |           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      ZNTH|     Zenith|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      BYTM|    ByteMax|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      ECOT|    EcoTone|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      SKYL|    SkyLink|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|     VOLT@|   VoltEdge|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      PHTX|   Photonix|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      URTL| UrbanTrail|          APP|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      COTC| CottonClub|    

In [0]:
# Strip leading/trailing whitespace from brand_name (e.g. "NovaWave " in the bronze preview)
brand_name_trimmed = F.trim(F.col("brand_name"))
df_silver = df_bronze.withColumn("brand_name", brand_name_trimmed)
df_silver.show(n=10)

+----------+----------+-------------+--------------------+--------------------+
|brand_code|brand_name|category_code|        _source_file|         ingested_at|
+----------+----------+-------------+--------------------+--------------------+
|      ACME|  AcmeTech|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      NOVW|  NovaWave|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      ZNTH|    Zenith|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      BYTM|   ByteMax|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      ECOT|   EcoTone|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      SKYL|   SkyLink|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|     VOLT@|  VoltEdge|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      PHTX|  Photonix|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      URTL|UrbanTrail|          APP|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      COTC|CottonClub|          APP|dbf

In [0]:
# Keep only letters and digits in brand_code (drops stray symbols such as the "@" in "VOLT@")
brand_code_alnum = F.regexp_replace(F.col("brand_code"), r'[^A-Za-z0-9]', '')
df_silver = df_silver.withColumn("brand_code", brand_code_alnum)
df_silver.show(n=10)

+----------+----------+-------------+--------------------+--------------------+
|brand_code|brand_name|category_code|        _source_file|         ingested_at|
+----------+----------+-------------+--------------------+--------------------+
|      ACME|  AcmeTech|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      NOVW|  NovaWave|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      ZNTH|    Zenith|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      BYTM|   ByteMax|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      ECOT|   EcoTone|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      SKYL|   SkyLink|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      VOLT|  VoltEdge|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      PHTX|  Photonix|           CE|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      URTL|UrbanTrail|          APP|dbfs:/Volumes/eco...|2025-11-08 05:10:...|
|      COTC|CottonClub|          APP|dbf

In [0]:
# List the distinct category codes to spot inconsistent spellings
df_silver.select(F.col("category_code")).distinct().show()

+-------------+
|category_code|
+-------------+
|           CE|
|          APP|
|          HNK|
|          BPC|
|        BOOKS|
|          BKS|
|      GROCERY|
|         GRCY|
|          TOY|
|         TOYS|
|          SPT|
+-------------+



In [0]:
# Map each non-standard category code to the canonical short code
anomalies = {
    "GROCERY": "GRCY",
    "BOOKS": "BKS",
    "TOYS": "TOY",
}

# Apply the mapping to category_code only; other columns are left untouched
df_silver = df_silver.na.replace(anomalies, subset=["category_code"])

# Confirm that only canonical codes remain
df_silver.select(F.col("category_code")).distinct().show()

+-------------+
|category_code|
+-------------+
|           CE|
|          APP|
|          HNK|
|          BPC|
|          BKS|
|         GRCY|
|          TOY|
|          SPT|
+-------------+



In [0]:
# Persist the cleaned brands data to the silver layer
# (catalog: ecommerce, schema: silver, table: slv_brands)
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(f"{catalog_name}.silver.slv_brands")
)

### Category

In [0]:
# Load the bronze category table and preview the first rows
df_bronze = spark.read.table(f"{catalog_name}.bronze.brz_category")
df_bronze.show(n=10)

+-------------+--------------------+--------------------+--------------------+
|category_code|       category_name|        _ingested_at|        _source_file|
+-------------+--------------------+--------------------+--------------------+
|           ce|         Electronics|2025-11-08 05:11:...|dbfs:/Volumes/eco...|
|          app|             Apparel|2025-11-08 05:11:...|dbfs:/Volumes/eco...|
|          hnk|      Home & Kitchen|2025-11-08 05:11:...|dbfs:/Volumes/eco...|
|          bpc|Beauty & Personal...|2025-11-08 05:11:...|dbfs:/Volumes/eco...|
|          bks|               Books|2025-11-08 05:11:...|dbfs:/Volumes/eco...|
|         grcy|             Grocery|2025-11-08 05:11:...|dbfs:/Volumes/eco...|
|          toy|        Toys & Games|2025-11-08 05:11:...|dbfs:/Volumes/eco...|
|          spt|   Sports & Outdoors|2025-11-08 05:11:...|dbfs:/Volumes/eco...|
|          app|             Apparel|2025-11-08 05:11:...|dbfs:/Volumes/eco...|
|         grcy|             Grocery|2025-11-08 05:11

In [0]:
# Category codes that occur more than once in the bronze table
category_counts = df_bronze.groupBy("category_code").count()
df_duplicates = category_counts.where(F.col("count") > 1)
display(df_duplicates)

category_code,count
app,2
grcy,2


In [0]:
# Keep a single row per category_code
df_silver = df_bronze.drop_duplicates(subset=["category_code"])
display(df_silver)

category_code,category_name,_ingested_at,_source_file
ce,Electronics,2025-11-08T05:11:05.835Z,dbfs:/Volumes/ecommerce/source_data/ecommerce_store/category/category.csv
app,Apparel,2025-11-08T05:11:05.835Z,dbfs:/Volumes/ecommerce/source_data/ecommerce_store/category/category.csv
hnk,Home & Kitchen,2025-11-08T05:11:05.835Z,dbfs:/Volumes/ecommerce/source_data/ecommerce_store/category/category.csv
bpc,Beauty & Personal Care,2025-11-08T05:11:05.835Z,dbfs:/Volumes/ecommerce/source_data/ecommerce_store/category/category.csv
bks,Books,2025-11-08T05:11:05.835Z,dbfs:/Volumes/ecommerce/source_data/ecommerce_store/category/category.csv
grcy,Grocery,2025-11-08T05:11:05.835Z,dbfs:/Volumes/ecommerce/source_data/ecommerce_store/category/category.csv
toy,Toys & Games,2025-11-08T05:11:05.835Z,dbfs:/Volumes/ecommerce/source_data/ecommerce_store/category/category.csv
spt,Sports & Outdoors,2025-11-08T05:11:05.835Z,dbfs:/Volumes/ecommerce/source_data/ecommerce_store/category/category.csv


In [0]:
# Upper-case category_code so it matches the codes used in the brands and products tables.
# De-duplicate again AFTER normalising: the earlier dropDuplicates ran on the raw values,
# so two codes that differ only in case (e.g. "app" and "APP") would both have survived it
# and would collide once upper-cased.
df_silver = (
    df_silver
    .withColumn("category_code", F.upper(F.col("category_code")))
    .dropDuplicates(["category_code"])
)
display(df_silver)

category_code,category_name,_ingested_at,_source_file
CE,Electronics,2025-11-08T05:11:05.835Z,dbfs:/Volumes/ecommerce/source_data/ecommerce_store/category/category.csv
APP,Apparel,2025-11-08T05:11:05.835Z,dbfs:/Volumes/ecommerce/source_data/ecommerce_store/category/category.csv
HNK,Home & Kitchen,2025-11-08T05:11:05.835Z,dbfs:/Volumes/ecommerce/source_data/ecommerce_store/category/category.csv
BPC,Beauty & Personal Care,2025-11-08T05:11:05.835Z,dbfs:/Volumes/ecommerce/source_data/ecommerce_store/category/category.csv
BKS,Books,2025-11-08T05:11:05.835Z,dbfs:/Volumes/ecommerce/source_data/ecommerce_store/category/category.csv
GRCY,Grocery,2025-11-08T05:11:05.835Z,dbfs:/Volumes/ecommerce/source_data/ecommerce_store/category/category.csv
TOY,Toys & Games,2025-11-08T05:11:05.835Z,dbfs:/Volumes/ecommerce/source_data/ecommerce_store/category/category.csv
SPT,Sports & Outdoors,2025-11-08T05:11:05.835Z,dbfs:/Volumes/ecommerce/source_data/ecommerce_store/category/category.csv


In [0]:
# Persist the cleaned category data to the silver layer
# (catalog: ecommerce, schema: silver, table: slv_category)
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(f"{catalog_name}.silver.slv_category")
)

### Products

In [0]:
# Load the raw products data from the bronze table (ecommerce.bronze.brz_products)
df_bronze = spark.read.table(f"{catalog_name}.bronze.brz_products")

# Size of the input: number of rows and number of columns
row_count = df_bronze.count()
column_count = len(df_bronze.columns)

# Report both figures
print(f"Row count: {row_count}")
print(f"Column count: {column_count}")

Row count: 50000
Column count: 14


In [0]:
# Preview five rows of the bronze products table
display(df_bronze.limit(5))

product_id,sku,category_code,brand_code,color,size,material,weight_grams,length_cm,width_cm,height_cm,rating_count,file_name,ingest_timestamp
2000000000015,STCR-HNK-00001,hnk,stcr,White,One-Size,Coton,305g,222,17.1,6.3,0,dbfs:/Volumes/ecommerce/source_data/ecommerce_store/products/products.csv,2025-11-08T05:11:34.097Z
2000000000022,HMNS-HNK-00002,hnk,hmns,Silver,One-Size,Steel,682g,182,12.3,3.7,1,dbfs:/Volumes/ecommerce/source_data/ecommerce_store/products/products.csv,2025-11-08T05:11:34.097Z
2000000000039,NOVW-CE-00003,ce,novw,Purple,One-Size,Wood,243g,182,13.9,4.2,0,dbfs:/Volumes/ecommerce/source_data/ecommerce_store/products/products.csv,2025-11-08T05:11:34.097Z
2000000000046,URTL-APP-00004,app,urtl,Silver,S,Ruber,225g,176,4.6,5.8,50,dbfs:/Volumes/ecommerce/source_data/ecommerce_store/products/products.csv,2025-11-08T05:11:34.097Z
2000000000053,GGRN-GRC-00005,grcy,ggrn,Silver,One-Size,Ruber,455g,272,15.8,7.4,-4,dbfs:/Volumes/ecommerce/source_data/ecommerce_store/products/products.csv,2025-11-08T05:11:34.097Z


Check `weight_grams` (contains 'g')

In [0]:
# Inspect the raw weight_grams values (untruncated) to see the unit suffix
df_bronze.select(F.col("weight_grams")).show(n=5, truncate=False)

+------------+
|weight_grams|
+------------+
|305g        |
|682g        |
|243g        |
|225g        |
|455g        |
+------------+
only showing top 5 rows


In [0]:
# Remove the "g" characters from weight_grams, then cast the remaining text to an integer
weight_as_int = F.regexp_replace(F.col("weight_grams"), "g", "").cast("int")
df_silver = df_bronze.withColumn("weight_grams", weight_as_int)
df_silver.select(F.col("weight_grams")).show(n=5, truncate=False)

+------------+
|weight_grams|
+------------+
|305         |
|682         |
|243         |
|225         |
|455         |
+------------+
only showing top 5 rows


Check `length_cm` (comma instead of dot)

In [0]:
# Inspect length_cm: the raw values use a comma as the decimal separator
df_silver.select(F.col("length_cm")).show(n=3)

+---------+
|length_cm|
+---------+
|     22,2|
|     18,2|
|     18,2|
+---------+
only showing top 3 rows


In [0]:
# Swap the decimal comma for a dot, then cast length_cm to float
length_as_float = F.regexp_replace(F.col("length_cm"), ",", ".").cast("float")
df_silver = df_silver.withColumn("length_cm", length_as_float)
df_silver.select(F.col("length_cm")).show(n=3)

+---------+
|length_cm|
+---------+
|     22.2|
|     18.2|
|     18.2|
+---------+
only showing top 3 rows


`category_code` and `brand_code` are in lower case; we need to convert them to upper case.

In [0]:
# Inspect the casing of the two code columns
df_silver.select(F.col("category_code"), F.col("brand_code")).show(n=2)

+-------------+----------+
|category_code|brand_code|
+-------------+----------+
|          hnk|      stcr|
|          hnk|      hmns|
+-------------+----------+
only showing top 2 rows


In [0]:
# Upper-case both code columns, one after the other
for code_col in ("category_code", "brand_code"):
    df_silver = df_silver.withColumn(code_col, F.upper(F.col(code_col)))

df_silver.select(F.col("category_code"), F.col("brand_code")).show(n=2)

+-------------+----------+
|category_code|brand_code|
+-------------+----------+
|          HNK|      STCR|
|          HNK|      HMNS|
+-------------+----------+
only showing top 2 rows


Spelling mistakes in `material` column

In [0]:
# List the distinct material values to spot misspellings
df_silver.select(F.col("material")).distinct().show()

+---------+
| material|
+---------+
|    Coton|
|    Steel|
|     Wood|
|    Ruber|
|  Plastic|
|Polyester|
|    Glass|
|  Alumium|
|    Paper|
|  Leather|
+---------+



In [0]:
# Misspelled material -> corrected spelling; every other value is left as it is
material_fixes = {
    "Coton": "Cotton",
    "Alumium": "Aluminum",
    "Ruber": "Rubber",
}
df_silver = df_silver.na.replace(material_fixes, subset=["material"])
df_silver.select(F.col("material")).distinct().show()

+---------+
| material|
+---------+
|   Cotton|
|    Steel|
|     Wood|
|   Rubber|
|  Plastic|
|Polyester|
|    Glass|
| Aluminum|
|    Paper|
|  Leather|
+---------+



Negative values in `rating_count`

In [0]:
# Show a few of the rows whose rating_count is negative
negative_ratings = df_silver.where(F.col("rating_count") < 0)
negative_ratings.select(F.col("rating_count")).show(n=3)


+------------+
|rating_count|
+------------+
|          -4|
|          -2|
|          -2|
+------------+
only showing top 3 rows


In [0]:
# Take the absolute value of rating_count; a NULL count becomes 0
# (abs of NULL is NULL, so coalesce falls through to the literal 0)
rating_count_clean = F.coalesce(F.abs(F.col("rating_count")), F.lit(0))
df_silver = df_silver.withColumn("rating_count", rating_count_clean)

In [0]:
# Review every column that was cleaned in this section
cleaned_columns = [
    "weight_grams",
    "length_cm",
    "category_code",
    "brand_code",
    "material",
    "rating_count",
]
df_silver.select(*cleaned_columns).show(n=10, truncate=False)

+------------+---------+-------------+----------+---------+------------+
|weight_grams|length_cm|category_code|brand_code|material |rating_count|
+------------+---------+-------------+----------+---------+------------+
|305         |22.2     |HNK          |STCR      |Cotton   |0           |
|682         |18.2     |HNK          |HMNS      |Steel    |1           |
|243         |18.2     |CE           |NOVW      |Wood     |0           |
|225         |17.6     |APP          |URTL      |Rubber   |50          |
|455         |27.2     |GRCY         |GGRN      |Rubber   |4           |
|232         |28.0     |BPC          |SLKE      |Plastic  |0           |
|507         |27.2     |CE           |VOLT      |Plastic  |5           |
|261         |27.7     |APP          |CBLT      |Polyester|0           |
|59          |12.5     |SPT          |ARFT      |Plastic  |11          |
|238         |10.7     |APP          |MOSA      |Polyester|6           |
+------------+---------+-------------+----------+--

In [0]:
# Persist the cleaned products data to the silver layer
# (catalog: ecommerce, schema: silver, table: slv_products)
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(f"{catalog_name}.silver.slv_products")
)

### Customers

In [0]:
# Load the raw customers data from the bronze table (ecommerce.bronze.brz_customers)
df_bronze = spark.read.table(f"{catalog_name}.bronze.brz_customers")

# Size of the input: number of rows and number of columns
row_count = df_bronze.count()
column_count = len(df_bronze.columns)

# Report both figures, then preview the data
print(f"Row count: {row_count}")
print(f"Column count: {column_count}")

df_bronze.show(n=10)

Row count: 300000
Column count: 7
+----------------+--------------+------------+--------------+-----+--------------------+--------------------+
|     customer_id|         phone|country_code|       country|state|           file_name|    ingest_timestamp|
+----------------+--------------+------------+--------------+-----+--------------------+--------------------+
|CUST000000000001|917280033536.0|          IN|         India|   MH|dbfs:/Volumes/eco...|2025-11-08 05:11:...|
|CUST000000000002|619489725433.0|          AU|     Australia|  VIC|dbfs:/Volumes/eco...|2025-11-08 05:11:...|
|CUST000000000003|919390066524.0|          IN|         India|   TN|dbfs:/Volumes/eco...|2025-11-08 05:11:...|
|CUST000000000004|917073741793.0|          IN|         India|   TN|dbfs:/Volumes/eco...|2025-11-08 05:11:...|
|CUST000000000005|618478772532.0|          AU|     Australia|   WA|dbfs:/Volumes/eco...|2025-11-08 05:11:...|
|CUST000000000006|916441718520.0|          IN|         India|   GJ|dbfs:/Volumes/eco..

Handle NULL values in `customer_id` column

In [0]:
# Count the rows whose customer_id is missing
missing_customer_id = F.col("customer_id").isNull()
null_count = df_bronze.where(missing_customer_id).count()
null_count

300

In [0]:
# 300 rows have no customer_id; show a few of them
df_bronze.where(F.col("customer_id").isNull()).show(n=3)

+-----------+--------------+------------+-------+-----+--------------------+--------------------+
|customer_id|         phone|country_code|country|state|           file_name|    ingest_timestamp|
+-----------+--------------+------------+-------+-----+--------------------+--------------------+
|       NULL|918187043562.0|          IN|  India|   DL|dbfs:/Volumes/eco...|2025-11-08 05:11:...|
|       NULL|917517243052.0|          IN|  India|   DL|dbfs:/Volumes/eco...|2025-11-08 05:11:...|
|       NULL|          NULL|          IN|  India|   GJ|dbfs:/Volumes/eco...|2025-11-08 05:11:...|
+-----------+--------------+------------+-------+-----+--------------------+--------------------+
only showing top 3 rows


In [0]:
# Discard every row that has no customer_id
df_silver = df_bronze.na.drop(subset=["customer_id"])

# Report how many rows are left
row_count = df_silver.count()
print(f"Row count after droping null values: {row_count}")

Row count after droping null values: 299700


Handle NULL values in `phone` column

In [0]:
# Count the remaining rows whose phone is missing
missing_phone = F.col("phone").isNull()
null_count = df_silver.where(missing_phone).count()
print(f"Number of nulls in phone: {null_count}")

Number of nulls in phone: 29964


In [0]:
# Show a few of the rows that have no phone number
df_silver.where(F.col("phone").isNull()).show(n=3)

+----------------+-----+------------+-------+-----+--------------------+--------------------+
|     customer_id|phone|country_code|country|state|           file_name|    ingest_timestamp|
+----------------+-----+------------+-------+-----+--------------------+--------------------+
|CUST000000000007| NULL|          IN|  India|   MH|dbfs:/Volumes/eco...|2025-11-08 05:11:...|
|CUST000000000010| NULL|          IN|  India|   RJ|dbfs:/Volumes/eco...|2025-11-08 05:11:...|
|CUST000000000026| NULL|          IN|  India|   WB|dbfs:/Volumes/eco...|2025-11-08 05:11:...|
+----------------+-----+------------+-------+-----+--------------------+--------------------+
only showing top 3 rows


In [0]:
# Replace missing phone numbers with the placeholder 'Not Available'
# NOTE(review): the bronze preview shows phone values ending in ".0" (e.g. 917280033536.0);
# that suffix is not removed here - confirm whether downstream consumers expect it.
df_silver = df_silver.na.fill("Not Available", subset=["phone"])

# Sanity check: this should print an empty table if no NULL phone values remain
df_silver.where(F.col("phone").isNull()).show()

+-----------+-----+------------+-------+-----+---------+----------------+
|customer_id|phone|country_code|country|state|file_name|ingest_timestamp|
+-----------+-----+------------+-------+-----+---------+----------------+
+-----------+-----+------------+-------+-----+---------+----------------+



In [0]:
# Persist the cleaned customers data to the silver layer
# (catalog: ecommerce, schema: silver, table: slv_customers)
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(f"{catalog_name}.silver.slv_customers")
)

### Calendar/Date

In [0]:
# Load the raw calendar data from the bronze table (ecommerce.bronze.brz_calendar)
df_bronze = spark.read.table(f"{catalog_name}.bronze.brz_calendar")

# Size of the input: number of rows and number of columns
row_count = df_bronze.count()
column_count = len(df_bronze.columns)

# Report both figures, then preview the data
print(f"Row count: {row_count}")
print(f"Column count: {column_count}")

df_bronze.show(n=3)

Row count: 95
Column count: 7
+----------+----+--------+-------+------------+--------------------+--------------------+
|      date|year|day_name|quarter|week_of_year|        _ingested_at|        _source_file|
+----------+----+--------+-------+------------+--------------------+--------------------+
|01-08-2025|2025|  friday|      3|         -31|2025-11-08 05:12:...|dbfs:/Volumes/eco...|
|02-08-2025|2025|SATURDAY|      3|         -31|2025-11-08 05:12:...|dbfs:/Volumes/eco...|
|03-08-2025|2025|  SUNDAY|      3|         -31|2025-11-08 05:12:...|dbfs:/Volumes/eco...|
+----------+----+--------+-------+------------+--------------------+--------------------+
only showing top 3 rows


In [0]:
# Inspect the column types of the bronze calendar table (date is still a string here)
df_bronze.printSchema()

root
 |-- date: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- day_name: string (nullable = true)
 |-- quarter: integer (nullable = true)
 |-- week_of_year: integer (nullable = true)
 |-- _ingested_at: timestamp (nullable = true)
 |-- _source_file: string (nullable = true)



Converting String to Date

In [0]:
# Parse the "dd-MM-yyyy" strings (e.g. "01-08-2025") into a proper date column.
# Uses the F alias imported in the first cell instead of a second, mid-notebook import,
# so every dependency of the notebook stays declared in one place.
df_silver = df_bronze.withColumn("date", F.to_date(F.col("date"), "dd-MM-yyyy"))

In [0]:
# printSchema() writes the schema itself and returns None, so it must not be wrapped
# in print() - doing so emits a stray "None" line after the schema.
df_silver.printSchema()

# Preview the converted dates
df_silver.show(5)

root
 |-- date: date (nullable = true)
 |-- year: integer (nullable = true)
 |-- day_name: string (nullable = true)
 |-- quarter: integer (nullable = true)
 |-- week_of_year: integer (nullable = true)
 |-- _ingested_at: timestamp (nullable = true)
 |-- _source_file: string (nullable = true)

None
+----------+----+--------+-------+------------+--------------------+--------------------+
|      date|year|day_name|quarter|week_of_year|        _ingested_at|        _source_file|
+----------+----+--------+-------+------------+--------------------+--------------------+
|2025-08-01|2025|  friday|      3|         -31|2025-11-08 05:12:...|dbfs:/Volumes/eco...|
|2025-08-02|2025|SATURDAY|      3|         -31|2025-11-08 05:12:...|dbfs:/Volumes/eco...|
|2025-08-03|2025|  SUNDAY|      3|         -31|2025-11-08 05:12:...|dbfs:/Volumes/eco...|
|2025-08-04|2025|  MONDAY|      3|         -32|2025-11-08 05:12:...|dbfs:/Volumes/eco...|
|2025-08-05|2025| TUESDAY|      3|         -32|2025-11-08 05:12:...|dbfs

Remove Duplicates

In [0]:
# Dates that appear more than once in the calendar
date_counts = df_silver.groupBy("date").count()
duplicates = date_counts.where(F.col("count") > 1)

# Report how many dates are duplicated and list them
print("Total duplicated Rows: ", duplicates.count())
display(duplicates)

Total duplicated Rows:  3


date,count
2025-08-29,2
2025-09-25,2
2025-10-13,2


In [0]:
# Keep a single row per date
df_silver = df_silver.drop_duplicates(subset=["date"])

# Report how many rows are left
row_count = df_silver.count()
print("Rows After removing Duplicates: ", row_count)

Rows After removing Duplicates:  92


Normalize the casing of `day_name`

In [0]:
# Normalise day_name casing: first letter upper-case, the rest lower-case ("SUNDAY" -> "Sunday")
day_name_title = F.initcap(F.col("day_name"))
df_silver = df_silver.withColumn("day_name", day_name_title)

df_silver.show(n=5)

+----------+----+--------+-------+------------+--------------------+--------------------+
|      date|year|day_name|quarter|week_of_year|        _ingested_at|        _source_file|
+----------+----+--------+-------+------------+--------------------+--------------------+
|2025-08-01|2025|  Friday|      3|         -31|2025-11-08 05:12:...|dbfs:/Volumes/eco...|
|2025-08-02|2025|Saturday|      3|         -31|2025-11-08 05:12:...|dbfs:/Volumes/eco...|
|2025-08-03|2025|  Sunday|      3|         -31|2025-11-08 05:12:...|dbfs:/Volumes/eco...|
|2025-08-04|2025|  Monday|      3|         -32|2025-11-08 05:12:...|dbfs:/Volumes/eco...|
|2025-08-05|2025| Tuesday|      3|         -32|2025-11-08 05:12:...|dbfs:/Volumes/eco...|
+----------+----+--------+-------+------------+--------------------+--------------------+
only showing top 5 rows


Convert negative `week_of_year` to positive

In [0]:
# week_of_year arrives negative (e.g. -31); store its absolute value instead
week_positive = F.abs(F.col("week_of_year"))
df_silver = df_silver.withColumn("week_of_year", week_positive)

df_silver.show(n=3)

+----------+----+--------+-------+------------+--------------------+--------------------+
|      date|year|day_name|quarter|week_of_year|        _ingested_at|        _source_file|
+----------+----+--------+-------+------------+--------------------+--------------------+
|2025-08-01|2025|  Friday|      3|          31|2025-11-08 05:12:...|dbfs:/Volumes/eco...|
|2025-08-02|2025|Saturday|      3|          31|2025-11-08 05:12:...|dbfs:/Volumes/eco...|
|2025-08-03|2025|  Sunday|      3|          31|2025-11-08 05:12:...|dbfs:/Volumes/eco...|
+----------+----+--------+-------+------------+--------------------+--------------------+
only showing top 3 rows


Enhance the `quarter` and `week_of_year` columns

In [0]:
# Build human-readable labels such as "Q3-2025" and "Week31-2025".
# A single F.concat is enough: the previous F.concat_ws(sep, F.concat(...)) wrapper received
# only one argument, so its separator was never applied. It also turned a NULL label into an
# empty string. With plain F.concat, a row with a missing quarter/week/year keeps a NULL label.
df_silver = df_silver.withColumn(
    "quarter",
    F.concat(F.lit("Q"), F.col("quarter"), F.lit("-"), F.col("year")),
)

df_silver = df_silver.withColumn(
    "week_of_year",
    F.concat(F.lit("Week"), F.col("week_of_year"), F.lit("-"), F.col("year")),
)

df_silver.show(3)

+----------+----+--------+-------+------------+--------------------+--------------------+
|      date|year|day_name|quarter|week_of_year|        _ingested_at|        _source_file|
+----------+----+--------+-------+------------+--------------------+--------------------+
|2025-08-01|2025|  Friday|Q3-2025| Week31-2025|2025-11-08 05:12:...|dbfs:/Volumes/eco...|
|2025-08-02|2025|Saturday|Q3-2025| Week31-2025|2025-11-08 05:12:...|dbfs:/Volumes/eco...|
|2025-08-03|2025|  Sunday|Q3-2025| Week31-2025|2025-11-08 05:12:...|dbfs:/Volumes/eco...|
+----------+----+--------+-------+------------+--------------------+--------------------+
only showing top 3 rows


Rename columns

In [0]:
# week_of_year now holds a "Week<NN>-<year>" label, so expose it under the shorter name "week"
df_silver = df_silver.withColumnRenamed(existing="week_of_year", new="week")

In [0]:
# Persist the cleaned calendar data to the silver layer
# (catalog: ecommerce, schema: silver, table: slv_calendar)
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(f"{catalog_name}.silver.slv_calendar")
)