In [0]:
from silver.Transform_Functions.Trim import func_trim_col
from pyspark.sql import functions as F
from pyspark.sql.functions import trim,col,length

# READING FROM BRONZE TABLE

In [0]:
# Load the raw ERP customer table from the bronze schema.
bronze_table_name = "workspace.bronze.erp_cust_az12_raw"
df = spark.table(bronze_table_name)

# Data Transformation

## Trim whitespace

In [0]:
# Run the shared trim helper over the whole frame.
# NOTE(review): func_trim_col lives in silver.Transform_Functions.Trim and is not
# visible here — presumably it strips leading/trailing whitespace from string
# columns; confirm against the helper before relying on that.
df = func_trim_col(df)

## Strip the leading "NAS" prefix from the CID column (e.g. "NASAW…" becomes "AW…")

In [0]:
# Remove a leading "NAS" from CID so that keys such as "NASAW..." become "AW...".
# The pattern is anchored with ^, so only a prefix is stripped; values that do not
# start with "NAS" pass through unchanged.
# No explicit null branch is needed: regexp_replace yields NULL for a NULL input,
# so null CIDs remain null.
df = df.withColumn("CID", F.regexp_replace(F.col("CID"), r"^NAS", ""))

## Normalize gender abbreviations

In [0]:
# Map gender codes to display labels, ignoring case:
#   "M" / "MALE"   -> "Male"
#   "F" / "FEMALE" -> "Female"
#   anything else (a NULL comparison is never true, so NULL lands here too) -> "n/a"
gen_upper = F.upper(F.col("GEN"))
gender_label = (
    F.when(gen_upper.isin("M", "MALE"), "Male")
    .when(gen_upper.isin("F", "FEMALE"), "Female")
    .otherwise("n/a")
)
df = df.withColumn("GEN", gender_label)

## Check for and drop duplicate CID records

In [0]:
# Detect duplicate CIDs with one grouped probe instead of two separate full
# count jobs: a CID is duplicated exactly when its group holds more than one row.
# limit(1) lets the check stop at the first offending key.
duplicate_cids = df.groupBy("CID").count().where(F.col("count") > 1)

if duplicate_cids.limit(1).count() > 0:
    print("Duplicates present...")
    print("Removing duplicate CID records...")
    # dropDuplicates keeps one arbitrary row per CID; which one survives is not
    # controlled here. Rows whose CID is NULL form a single group and are
    # collapsed to one row as well.
    df = df.dropDuplicates(["CID"])
else:
    print("No duplicates")

## Birth date Validation 

In [0]:
# A birth date later than today cannot be valid: replace it with NULL and keep
# every other value exactly as it is.
bdate_in_future = F.col("BDATE") > F.current_date()
df = df.withColumn(
    "BDATE",
    F.when(bdate_in_future, None).otherwise(F.col("BDATE")),
)

## Renaming the columns

In [0]:
# Source (bronze) column name -> target (silver) column name.
erpCust_table_header = dict(
    CID="customer_key",
    BDATE="birth_date",
    GEN="gender",
)


In [0]:
# Apply the rename map one column at a time, in the map's insertion order.
for source_name in erpCust_table_header:
    df = df.withColumnRenamed(source_name, erpCust_table_header[source_name])

In [0]:
# Preview the transformed frame in the notebook before it is written out.
df.display()

# Write into Silver Schema

In [0]:
# Persist the cleaned customers as a Delta table; "overwrite" replaces whatever
# the target table currently holds.
silver_writer = df.write.format("delta").mode("overwrite")
silver_writer.saveAsTable("silver.erp_customers")


In [0]:
%sql
-- Spot-check: read back a handful of rows from the silver table written above.
SELECT * FROM silver.erp_customers
LIMIT 5;