In [0]:
%sql
-- Preview the bronze customer table (the source table read by the silver cell).
SELECT *
FROM dev_catalog.default.bronze_customer

In [0]:
# ---------------------------------------------
# Silver Notebook (silver_clean_customer.py)
# ---------------------------------------------
# Incrementally reads the bronze customer Delta table, keeps insert/update
# rows whose key fields are present, de-duplicates them, normalises `name`,
# drops address columns and appends the result to the silver Delta table.
from pyspark.sql.functions import col, regexp_extract

# Table & path setup
catalog = "dev_catalog"
schema = "default"
bronze_table = f"{catalog}.{schema}.bronze_customer"
silver_table = f"{catalog}.{schema}.silver_customer"
# Checkpoint directory that tracks which bronze data this stream has consumed.
checkpoint_path = "/Volumes/dev_catalog/default/raw_json_data/checkpoints/silver_customerv2/"

# Read from Bronze Delta table (as a stream, so each run only sees new data)
bronze_df = spark.readStream.format("delta").table(bronze_table)

# Filter, clean, and transform
df_cleaned = (
    bronze_df
    # Only inserts and updates are propagated; other operations are discarded.
    .filter(col("operation").isin("insert", "update"))
    # Rows missing any of the key fields are dropped.
    .dropna(subset=["customer_id", "email", "name"])
    # NOTE(review): streaming dropDuplicates without a watermark keeps every
    # (customer_id, email) pair in state indefinitely — confirm the key
    # cardinality makes that acceptable.
    .dropDuplicates(["customer_id", "email"])
    # Keep only the "Customer <digits>" part of the name.
    # NOTE(review): regexp_extract yields "" (not NULL) when nothing matches,
    # so non-conforming names pass the dropna above as empty strings — verify.
    .withColumn("name", regexp_extract(col("name"), r"(Customer \d+)", 1))
    .drop("address", "zip_code")
)

# Write to Silver Delta table
silver_query = (
    df_cleaned.writeStream
    .format("delta")
    .option("checkpointLocation", checkpoint_path)
    .option("mergeSchema", "true")
    .outputMode("append")
    # Process everything currently available, then stop.
    .trigger(availableNow=True)
    .toTable(silver_table)
)

# The query runs asynchronously; block until the availableNow run has finished
# so that later cells (SELECT / ALTER TABLE on the silver table) do not race
# with an in-flight write.
silver_query.awaitTermination()


In [0]:
%sql
-- Inspect the contents of the silver customer table.
SELECT *
FROM dev_catalog.default.silver_customer

In [0]:
%sql
-- Switch the silver customer table to name-based Delta column mapping.
ALTER TABLE dev_catalog.default.silver_customer
  SET TBLPROPERTIES (
    'delta.columnMapping.mode' = 'name'
  );


In [0]:
%sql
-- Remove the `city` column from the silver customer table.
-- Delta only permits DROP COLUMN on tables with column mapping enabled.
-- IF EXISTS keeps this cell re-runnable: once the column is gone, a plain
-- DROP COLUMN would fail on the next Run All.
ALTER TABLE dev_catalog.default.silver_customer
  DROP COLUMN IF EXISTS city;
