In [0]:
%run "../01_setup/01_config"

In [0]:
from pyspark.sql import functions as F

In [0]:
# Bronze-layer locations of the departments extract, one per source hospital.
# `bronze_path` is presumably defined by the config notebook run at the top — confirm.
hosa_path, hosb_path = (
    f"{bronze_path}/{hospital}/departments" for hospital in ("hosa", "hosb")
)

# Fully qualified name of the table this notebook writes to.
target_table = "healthcare_rcm_databricks.silver.departments"

In [0]:
# Load each hospital's departments extract from its Bronze path.
# Parquet files carry their own schema, so no reader options are needed here;
# the `header` option that used to be set is a CSV reader option and had no
# effect on a Parquet read.
df_hosa = spark.read.format("parquet").load(hosa_path)
df_hosb = spark.read.format("parquet").load(hosb_path)

In [0]:
# Stack the two hospital extracts into one DataFrame, matching columns by name
# rather than by position so a different column order in either source is harmless.
# allowMissingColumns=False (the default) is spelled out to make it explicit that
# both sources must expose the same set of columns.
df_merged = df_hosa.unionByName(df_hosb, allowMissingColumns=False)

In [0]:
#df_merged.display()

## Transformations

In [0]:
# Rename the source columns to the snake_case names used by the rest of this notebook.
df_merged = df_merged.withColumnsRenamed(
    {
        "DeptID": "src_dept_id",
        "Name": "dept_name",
        "datasource": "data_source",
    }
)

In [0]:
# Row count of the merged data before de-duplication (triggers a Spark job).
records_before_dedup = df_merged.count()
print("Total records before dropping dupliactes: ", records_before_dedup)

In [0]:
# Keep a single row per (src_dept_id, data_source) pair.
df_merged = df_merged.dropDuplicates(["src_dept_id", "data_source"])

# Row count after de-duplication (triggers another Spark job).
records_after_dedup = df_merged.count()
print("Total records After dropping dupliactes: ", records_after_dedup)


In [0]:
# Normalise department names: strip surrounding whitespace, then capitalise each word.
cleaned_dept_name = F.initcap(F.trim(F.col("dept_name")))
df_merged = df_merged.withColumn("dept_name", cleaned_dept_name)

In [0]:
# Stamp every row with the timestamp of this Silver load.
# current_timestamp() already returns a Column, so it does not need to be
# wrapped in F.lit(); the redundant wrapper has been removed.
df_silver = df_merged.withColumn("silver_ingest_date", F.current_timestamp())

In [0]:
# Build the key as "<src_dept_id>-<data_source>".
# NOTE(review): Spark's concat yields NULL when any input is NULL, so a row with a
# missing src_dept_id or data_source would get a NULL dept_key — confirm this is intended.
dept_key_parts = [F.col("src_dept_id"), F.lit("-"), F.col("data_source")]
df_silver = df_silver.withColumn("dept_key", F.concat(*dept_key_parts))

In [0]:
# Flag rows that are missing either the department id or the department name.
missing_required_field = F.col("src_dept_id").isNull() | F.col("dept_name").isNull()

df_silver = df_silver.withColumn(
    "is_quarantined",
    F.when(missing_required_field, F.lit(True)).otherwise(F.lit(False)),
)

In [0]:
# Columns carried into the final DataFrame, in output order.
silver_columns = [
    "src_dept_id",
    "dept_key",
    "dept_name",
    "data_source",
    "silver_ingest_date",
    "is_quarantined",
]
df_final = df_silver.select(*silver_columns)

## Load to Silver Layer

In [0]:
# Count up front so that an empty source never replaces the existing table.
row_count = df_final.count()

if row_count <= 0:
    print("Source data is empty. Skipping overwrite to prevent data loss.")
    # NOTE(review): dbutils.notebook.exit appears to end the run normally with this
    # string as its return value, so an orchestrator may have to inspect the value
    # to detect the failure — confirm this is the intended signalling.
    dbutils.notebook.exit("FAILED: Empty Source")
else:
    # Full-load strategy: replace the table contents (and schema, if it changed)
    # with this run's data, stored as Delta and clustered by dept_key.
    silver_writer = (
        df_final.write
        .format("delta")
        .mode("overwrite")
        .clusterBy("dept_key")
        .option("overwriteSchema", "true")
    )
    # saveAsTable registers the result under the table name rather than an explicit path.
    silver_writer.saveAsTable(target_table)


In [0]:
# Read the target table back and render it to verify the load.
silver_preview = spark.sql(f"SELECT * FROM {target_table}")
display(silver_preview)