In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# ============================================================
# 1. READ RAW FILE
# ============================================================
df_raw = (
    spark.read
        .text("<file_location>.csv")
        .withColumnRenamed("value", "raw_line")
)

# ============================================================
# 2. NORMALIZE DELIMITERS â†’ ALWAYS SEMICOLON
# ============================================================
df_norm = (
    df_raw
        .withColumn("step1", F.regexp_replace("raw_line", r"\t", ";"))
        .withColumn("step2", F.regexp_replace("step1", r",", ";"))
        .withColumn("step3", F.regexp_replace("step2", r";{2,}", ";"))
        .withColumn("step4", F.regexp_replace("step3", r"[^\x00-\x7F]", ""))  # remove emojis
        .withColumn("clean_line", F.trim("step4"))
        .select("clean_line")
)

# ============================================================
# 3. SPLIT INTO FIELDS SAFELY
# ============================================================
df_split = df_norm.withColumn("fields", F.split("clean_line", ";"))

df_final = df_split.select(
    F.expr("get(fields, 0)").alias("order_id"),
    F.expr("get(fields, 1)").alias("customer_id"),
    F.expr("get(fields, 2)").alias("product_code"),
    F.expr("get(fields, 3)").alias("quantity"),
    F.expr("get(fields, 4)").alias("order_date"),
    F.expr("get(fields, 5)").alias("total_amount"),
    F.expr("get(fields, 6)").alias("status")
)

# Remove header
df_final = df_final.filter(F.col("order_id") != "order_id")

# ============================================================
# 4. CLEAN + NORMALIZE ALL COLUMNS
# ============================================================

# ---- customer_id ----
df_clean = df_final.withColumn(
    "customer_id",
    F.when(F.col("customer_id").rlike("^C\\d{3}$"), F.col("customer_id")).otherwise(None)
)

# ---- product_code ----
df_clean = df_clean.withColumn(
    "product_code",
    F.upper(F.regexp_replace("product_code", r"[^A-Za-z]", ""))
).withColumn(
    "product_code",
    F.when(F.trim("product_code") == "", None).otherwise(F.col("product_code"))
)

# ---- quantity ----
# ---- total_amount (SAFE WITH try_cast) ----
df_clean = df_clean.withColumn(
    "total_amount",
    F.regexp_replace("total_amount", r"[^0-9.]", "")
).withColumn(
    "total_amount",
    F.expr("try_cast(total_amount AS double)")
)


# ---- order_date ----
df_clean = df_clean.withColumn(
    "order_date_clean",
    F.coalesce(
        F.try_to_date("order_date", "yyyy-MM-dd"),
        F.try_to_date("order_date", "yyyy/MM/dd"),
        F.try_to_date("order_date", "dd/MM/yyyy"),
        F.try_to_date("order_date", "MM-dd-yy"),
        F.try_to_date("order_date", "MMMM dd, yyyy")
    )
).drop("order_date").withColumnRenamed("order_date_clean", "order_date")

# ---- total_amount (SAFE WITH try_cast) ----
df_clean = df_clean.withColumn(
    "total_amount",
    F.regexp_replace("total_amount", r"[^0-9.]", "")
).withColumn(
    "total_amount",
    F.expr("try_cast(total_amount AS double)")
)

# ---- status ----
df_clean = df_clean.withColumn(
    "status",
    F.lower(F.trim(F.regexp_replace("status", r"[^A-Za-z]", "")))
)

# ============================================================
# 5. RESEQUENCE ORDER_ID (ALWAYS START AT O001)
# ============================================================
df_tmp = df_clean.withColumn("dummy", F.lit(1))
w = Window.partitionBy("dummy").orderBy(F.monotonically_increasing_id())

df_silver = (
    df_tmp
        .withColumn("seq", F.row_number().over(w))
        .withColumn("order_id", F.concat(F.lit("O"), F.lpad("seq", 3, "0")))
        .drop("seq", "dummy")
)

df_silver.write.format("delta").mode("overwrite").save(
    "<output_location>orders"
)



In [0]:
df_orders.count()


150

In [0]:
df_orders.printSchema()


root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- product_code: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- status: string (nullable = true)
 |-- order_date: date (nullable = true)



In [0]:
df_orders.show(20, truncate=False)


+--------+-----------+------------+----------+------------+---------+----------+
|order_id|customer_id|product_code|quantity  |total_amount|status   |order_date|
+--------+-----------+------------+----------+------------+---------+----------+
|O001    |NULL       |CODE        |1         |206.42      |pending  |2026-01-14|
|O002    |NULL       |TVWR        |1.0       |2026.0      |         |NULL      |
|O003    |NULL       |BAD         |1.0       |250.84      |pending  |NULL      |
|O004    |C096       |CODE        |1         |652.85      |NULL     |2026-01-21|
|O005    |C049       |ZERO        |01-16-26  |NULL        |NULL     |NULL      |
|O006    |NULL       |NULL        |not_a_date|NULL        |NULL     |NULL      |
|O007    |C129       |CODE        |-5        |NULL        |pending  |2026-01-02|
|O008    |C089       |CODE        |          |240.41      |completed|2026-01-03|
|O009    |C150       |UJDD        |zero      |2026.0      |         |NULL      |
|O010    |C073       |BAD   