In [0]:
%run "../01_setup/01_config"

In [0]:
from pyspark.sql import functions as F

In [0]:
# Fully qualified name of the silver table this notebook populates.
target_table = "healthcare_rcm_databricks.silver.providers"

# Bronze provider extracts, one location per source hospital.
# NOTE(review): bronze_path is not defined in this notebook; presumably it is
# provided by the %run of 01_config -- confirm.
hosb_path = f"{bronze_path}/hosb/providers"
hosa_path = f"{bronze_path}/hosa/providers"

In [0]:
# Load each hospital's bronze provider extract. Parquet files carry their own
# schema, so no CSV-style reader options (such as `header`) are needed.
df_hosa = spark.read.parquet(hosa_path)
df_hosb = spark.read.parquet(hosb_path)

# Stack both sources. unionByName matches columns by name rather than by
# position, and fails if the two frames do not expose the same column names.
df_merged = df_hosa.unionByName(df_hosb)

In [0]:
# Preview the combined bronze rows before any transformation is applied.
display(df_merged)

## Transformations

In [0]:
# Source column name -> snake_case name used from here on.
silver_column_names = {
    'ProviderID': 'src_provider_id',
    'FirstName': 'first_name',
    'LastName': 'last_name',
    'Specialization': 'specialization',
    'DeptID': 'src_dept_id',
    'NPI': 'npi_code',
    'datasource': 'data_source',
}

df_merged = df_merged.withColumnsRenamed(silver_column_names)

In [0]:
# Keep a single row per (source provider id, data source) pair. No ordering is
# applied beforehand, so which duplicate survives is not controlled here.
# NOTE(review): rows with a null src_provider_id in the same data_source look
# like they would also collapse to one row -- confirm this is intended, since
# such rows are flagged as quarantined further down.
provider_identity = ['src_provider_id', 'data_source']
df_merged = df_merged.dropDuplicates(provider_identity)

In [0]:
# Surrogate key for the provider, built as "<src_provider_id>-<data_source>"
# so that ids coming from different hospitals stay distinct.
provider_key_expr = F.concat(
    F.col('src_provider_id'),
    F.lit('-'),
    F.col('data_source'),
)
df_merged = df_merged.withColumn('provider_key', provider_key_expr)

In [0]:
# Department id qualified by data source, in the same "<id>-<data_source>"
# form as provider_key.
dept_id_expr = F.concat(
    F.col('src_dept_id'),
    F.lit('-'),
    F.col('data_source'),
)
df_merged = df_merged.withColumn('dept_id', dept_id_expr)

In [0]:
# Free-text columns to normalise.
cols = ['first_name', 'last_name', 'specialization']

# For each column: trim the ends, collapse every whitespace run to a single
# space, then capitalise the first letter of each word.
# All columns are rewritten in one withColumns projection rather than one
# withColumn call per loop iteration, which would add a projection each time.
df_merged = df_merged.withColumns({
    c: F.initcap(F.regexp_replace(F.trim(F.col(c)), r"\s+", " "))
    for c in cols
})

In [0]:
# Flag rows that are missing either source identifier. isNull() always
# evaluates to true or false (never null), so the OR-ed condition is stored
# directly as the boolean flag.
missing_source_ids = F.col('src_provider_id').isNull() | F.col('src_dept_id').isNull()
df_merged = df_merged.withColumn("is_quarantined", missing_source_ids)

In [0]:
# Stamp every row with the load timestamp. current_timestamp() already returns
# a Column, so it is used directly instead of being wrapped in F.lit().
df_merged = df_merged.withColumn('silver_ingest_date', F.current_timestamp())

In [0]:
# Columns written to the silver table, in output order.
silver_columns = [
    'provider_key',
    'src_provider_id',
    'first_name',
    'last_name',
    'specialization',
    'src_dept_id',
    'dept_id',
    'npi_code',
    'data_source',
    'silver_ingest_date',
    'is_quarantined',
]
df_final = df_merged.select(*silver_columns)


## Load to Silver Layer

In [0]:
# Never overwrite the silver table with an empty result: a full overwrite from
# an empty source would wipe the existing rows.
# isEmpty() only needs to find out whether at least one row exists, so it
# avoids counting every row before the write computes the data again.
if df_final.isEmpty():
    print("Source data is empty. Skipping overwrite to prevent data loss.")
    dbutils.notebook.exit("FAILED: Empty Source")
else:
    # Full overwrite of the Delta table, clustered by the surrogate key.
    # overwriteSchema lets the table schema be replaced by df_final's schema.
    (
        df_final.write.format("delta")
        .mode("overwrite")
        .clusterBy("provider_key")
        .option("overwriteSchema", True)
        .saveAsTable(target_table)
    )

In [0]:
# Read back the table that was just written. target_table is used instead of a
# repeated literal so this check cannot drift from the write target.
display(spark.table(target_table))