In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# ============================================================
# 1. READ RAW FILE
# ============================================================
# Load the file through the plain-text source: each physical line becomes one
# row in a single string column named "value", renamed here to "raw_line".
# Parsing is postponed so delimiters can be normalised first.
df_raw = (
    spark.read
        .format("text")
        .load("<file_location>products.csv")
        .withColumnRenamed("value", "raw_line")
)

# ============================================================
# 2. NORMALIZE DELIMITERS → ALWAYS SEMICOLON
# ============================================================
# Number of fields in a well-formed record: product_id, product_code,
# product_name, category, price, status.
EXPECTED_FIELD_COUNT = 6

# Treat tabs and commas as delimiters too by mapping both to ";".
unified_line = F.regexp_replace(
    F.regexp_replace(F.col("raw_line"), r"\t", ";"), r",", ";"
)

# Collapsing every run of ";" unconditionally deletes empty fields
# ("P001;ABC;;Toys" -> "P001;ABC;Toys") and silently shifts the remaining
# values into the wrong columns. Only collapse when the line holds MORE fields
# than the schema allows, i.e. when the repeated delimiters cannot all stand
# for empty values. Lines with the expected number of fields (or fewer) are
# left untouched so that empty fields keep their position.
has_surplus_fields = F.size(F.split(unified_line, ";")) > EXPECTED_FIELD_COUNT
delimited_line = F.when(
    has_surplus_fields, F.regexp_replace(unified_line, r";{2,}", ";")
).otherwise(unified_line)

df_norm = df_raw.select(
    F.trim(
        # Drops every non-ASCII character (emojis, accented letters, ...).
        # Accented letters are removed outright, not transliterated.
        F.regexp_replace(delimited_line, r"[^\x00-\x7F]", "")
    ).alias("clean_line")
)

# ============================================================
# 3. SPLIT INTO FIELDS SAFELY
# ============================================================
# Output column names, listed in the order the fields appear in a record.
PRODUCT_COLUMNS = [
    "product_id",
    "product_code",
    "product_name",
    "category",
    "price",
    "status",
]

df_split = df_norm.withColumn("fields", F.split(F.col("clean_line"), ";"))

# Positional access goes through SQL get(), which yields NULL instead of
# failing when a record has fewer fields than expected.
df_final = df_split.select(
    *[
        F.expr(f"get(fields, {position})").alias(column_name)
        for position, column_name in enumerate(PRODUCT_COLUMNS)
    ]
)

# Remove header
# The header row is recognised by its first field. Compare case-insensitively
# and ignore surrounding whitespace, so variants such as "Product_ID" or
# "product_id " are dropped too instead of surviving as a junk record.
df_final = df_final.filter(
    F.lower(F.trim(F.col("product_id"))) != "product_id"
)

# ============================================================
# 4. CLEAN + NORMALIZE ALL COLUMNS
# ============================================================

# ---- product_id ----
# A valid id is "P" followed by exactly three digits; anything else -> NULL.
# Fields are split on ";" without per-field trimming, so strip whitespace
# before validating: otherwise an id such as "P001 " is discarded as invalid.
product_id_trimmed = F.trim(F.col("product_id"))
df_clean = df_final.withColumn(
    "product_id",
    # when() without otherwise() evaluates to NULL for non-matching rows.
    F.when(product_id_trimmed.rlike("^P\\d{3}$"), product_id_trimmed),
)

# ---- product_code ----
# Keep only ASCII letters and digits, upper-case what remains, and store NULL
# when nothing is left.
product_code_clean = F.upper(
    F.regexp_replace(F.col("product_code"), r"[^A-Za-z0-9]", "")
)
df_clean = df_clean.withColumn(
    "product_code",
    # when() without otherwise() evaluates to NULL, which covers the blank case.
    F.when(F.trim(product_code_clean) != "", product_code_clean),
)

# ---- product_name ----
# Keep ASCII letters, digits, underscores and spaces, then trim.
# A name that is blank after cleaning becomes NULL (the same convention
# product_code uses) so "missing" has a single representation in the table.
product_name_clean = F.trim(
    F.regexp_replace(F.col("product_name"), r"[^A-Za-z0-9_ ]", "")
)
df_clean = df_clean.withColumn(
    "product_name",
    # when() without otherwise() evaluates to NULL for the blank case.
    F.when(product_name_clean != "", product_name_clean),
)

# ---- category ----
# Keep ASCII letters only. Spaces are removed as well, so a multi-word
# category is joined into one token.
# NOTE(review): confirm that joining words is intended.
# A category that is blank after cleaning becomes NULL (the same convention
# product_code uses) so "missing" has a single representation in the table.
category_clean = F.trim(
    F.regexp_replace(F.col("category"), r"[^A-Za-z]", "")
)
df_clean = df_clean.withColumn(
    "category",
    # when() without otherwise() evaluates to NULL for the blank case.
    F.when(category_clean != "", category_clean),
)

# ---- price (SAFE CAST) ----
# Strip currency symbols and other noise but KEEP the minus sign. Removing it
# would silently turn "-5.00" into 5.0 and a date-like "2026-01-14" into
# 20260114.0. With the sign retained, negative values stay negative (visible
# to downstream checks) and malformed values such as "12-5" fail the cast.
# try_cast returns NULL instead of raising when the text is not a valid double.
df_clean = df_clean.withColumn(
    "price",
    F.regexp_replace(F.col("price"), r"[^0-9.-]", "")
).withColumn(
    "price",
    F.expr("try_cast(price AS double)")
)

# ---- status ----
# Keep ASCII letters only, trim and lower-case.
# A status that is blank after cleaning becomes NULL (the same convention
# product_code uses); otherwise the column mixes '' and NULL for "missing".
status_clean = F.lower(
    F.trim(F.regexp_replace(F.col("status"), r"[^A-Za-z]", ""))
)
df_clean = df_clean.withColumn(
    "status",
    # when() without otherwise() evaluates to NULL for the blank case.
    F.when(status_clean != "", status_clean),
)

# ============================================================
# 5. WRITE SILVER TO DELTA
# ============================================================
# Persist the cleaned DataFrame in Delta format, replacing whatever is already
# stored at the target location.
df_clean.write.save(
    "<outputlocation>",
    format="delta",
    mode="overwrite",
)


In [0]:
# Load the Delta table and inspect it: row count, schema and a sample.
df_orders = spark.read.format("delta").load(
    "<output_location>"
)
# A bare count() in the middle of a cell is evaluated and then thrown away
# (only the last expression of a cell is displayed), so print it explicitly.
print(f"row count: {df_orders.count()}")
df_orders.printSchema()
df_orders.show(20, truncate=False)

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- product_code: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- status: string (nullable = true)
 |-- order_date: date (nullable = true)

+--------+-----------+------------+----------+------------+---------+----------+
|order_id|customer_id|product_code|quantity  |total_amount|status   |order_date|
+--------+-----------+------------+----------+------------+---------+----------+
|O001    |NULL       |CODE        |1         |206.42      |pending  |2026-01-14|
|O002    |NULL       |TVWR        |1.0       |2026.0      |         |NULL      |
|O003    |NULL       |BAD         |1.0       |250.84      |pending  |NULL      |
|O004    |C096       |CODE        |1         |652.85      |NULL     |2026-01-21|
|O005    |C049       |ZERO        |01-16-26  |NULL        |NULL     |NULL      |
|O006    |NULL       |NULL        |not_a_date|NULL       

In [0]:
%sql
-- Return every column and row of the Delta table stored at this path,
-- addressed directly by location through the delta.`<path>` syntax.
SELECT *
FROM delta.`/Volumes/workspace/default/project3/silver/orders`;


order_id,customer_id,product_code,quantity,total_amount,status,order_date
O001,,CODE,1,206.42,pending,2026-01-14
O002,,TVWR,1.0,2026.0,,
O003,,BAD,1.0,250.84,pending,
O004,C096,CODE,1,652.85,,2026-01-21
O005,C049,ZERO,01-16-26,,,
O006,,,not_a_date,,,
O007,C129,CODE,-5,,pending,2026-01-02
O008,C089,CODE,,240.41,completed,2026-01-03
O009,C150,UJDD,zero,2026.0,,
O010,C073,BAD,1.0,569.36,pending,2026-01-15
