In [0]:
import dlt
from pyspark.sql.functions import *

In [0]:
# Base directory holding the raw input files; the bronze loaders below build
# their file paths from it.
VOLUME_PATH = "/Volumes/healthcare/default/data"

**Bronze**

In [0]:
@dlt.table(
    name="bronze.claims_batch",
    comment="Raw historical claims from batch CSV files.",
    table_properties={"quality": "bronze"},
)
def bronze_claims_batch():
    """Ingest the historical claims CSV incrementally with Auto Loader.

    This table is consumed downstream through ``dlt.read_stream``, which
    needs an append-only source. A batch ``spark.read`` here would yield a
    dataset that is recomputed on every pipeline update, which a streaming
    reader cannot follow reliably; reading through ``cloudFiles`` makes this
    an append-only streaming table instead.

    The header row is honoured and column types are inferred
    (``cloudFiles.inferColumnTypes`` plays the role of ``inferSchema``).

    NOTE(review): with schema inference Auto Loader adds a ``_rescued_data``
    column by default -- confirm that consumers tolerate the extra column.

    Returns:
        A streaming DataFrame over ``claims_batch.csv`` in the volume.
    """
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "csv")
        .option("cloudFiles.inferColumnTypes", "true")
        .option("header", "true")
        .load(f"{VOLUME_PATH}/claims_batch.csv")
    )

@dlt.table(
    name="bronze.claims_stream",
    comment="New claims from streaming JSON files.",
    table_properties={"quality": "bronze"},
)
def bronze_claims_stream():
    """Ingest newly arriving claims JSON incrementally with Auto Loader.

    The table comment describes a streaming feed and the table is consumed
    downstream through ``dlt.read_stream``, so the source has to be
    append-only. A batch ``spark.read`` would produce a dataset that is
    recomputed on every pipeline update; ``cloudFiles`` ingestion makes this
    a genuine streaming table.

    ``multiLine`` is kept because the JSON documents span several lines, and
    ``cloudFiles.inferColumnTypes`` keeps typed columns (Auto Loader would
    otherwise infer every JSON field as a string).

    NOTE(review): with schema inference Auto Loader adds a ``_rescued_data``
    column by default -- confirm that consumers tolerate the extra column.

    Returns:
        A streaming DataFrame over ``claims_stream.json`` in the volume.
    """
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "json")
        .option("cloudFiles.inferColumnTypes", "true")
        .option("multiLine", "true")
        .load(f"{VOLUME_PATH}/claims_stream.json")
    )

@dlt.table(
    name="bronze.members",
    comment="Raw members master data from CSV.",
    table_properties={"quality": "bronze"},
)
def bronze_members():
    """Batch-load the members CSV from the volume.

    The first row is treated as the header and column types are inferred
    from the data.

    Returns:
        A batch DataFrame with the contents of ``members.csv``.
    """
    source_file = f"{VOLUME_PATH}/members.csv"
    csv_reader = spark.read.format("csv").options(header="true", inferSchema="true")
    return csv_reader.load(source_file)

@dlt.table(
    name="bronze.providers",
    comment="Raw providers directory data from JSON.",
    table_properties={"quality": "bronze"},
)
def bronze_providers():
    """Batch-load the providers JSON from the volume.

    The reader runs in multi-line mode, so a JSON document may span several
    lines.

    Returns:
        A batch DataFrame with the contents of ``providers.json``.
    """
    source_file = f"{VOLUME_PATH}/providers.json"
    json_reader = spark.read.format("json").options(multiline="true")
    return json_reader.load(source_file)

@dlt.table(
    name="bronze.diagnosis_ref",
    comment="Reference data for diagnosis codes (ICD-10).",
    table_properties={"quality": "bronze"},
)
def bronze_diagnosis_ref():
    """Batch-load the diagnosis-code reference CSV from the volume.

    The first row is treated as the header and column types are inferred
    from the data.

    Returns:
        A batch DataFrame with the contents of ``diagnosis_ref.csv``.
    """
    source_file = f"{VOLUME_PATH}/diagnosis_ref.csv"
    csv_reader = spark.read.format("csv").options(header="true", inferSchema="true")
    return csv_reader.load(source_file)


**Silver**

In [0]:
@dlt.view(
    name="claims_prepared_base",
    comment="Intermediate view for base claims enrichment (member, provider).",
)
def claims_prepared_base():
    """Combine both bronze claim feeds and enrich them with member/provider names.

    Steps:
      1. Stream ``bronze.claims_batch`` and ``bronze.claims_stream`` and union
         them by column name; a column present in only one feed is kept and
         filled with nulls for the other.
      2. Inner-join the static members and providers tables, so a claim
         without a matching ``MemberID`` or ``ProviderID`` is dropped.
      3. Derive ``timestamp`` from ``EventTimestamp``, falling back to
         ``IngestTimestamp`` when the former is null.
      4. Project the claim columns, casting ``ClaimDate`` to date and
         ``Amount`` to double.

    NOTE(review): ``dlt.read_stream`` needs append-only upstream tables --
    verify that both bronze claim tables are defined as streaming sources.

    Returns:
        A streaming DataFrame of enriched claims.
    """
    # Streaming side: both claim feeds merged into one relation aliased "c".
    historical_claims = dlt.read_stream("bronze.claims_batch")
    incoming_claims = dlt.read_stream("bronze.claims_stream")
    claims = historical_claims.unionByName(
        incoming_claims, allowMissingColumns=True
    ).alias("c")

    # Static side: lookup tables used for enrichment.
    member_dim = dlt.read("bronze.members").alias("m")
    provider_dim = dlt.read("bronze.providers").alias("p")

    enriched = claims.join(member_dim, "MemberID", "inner").join(
        provider_dim, "ProviderID", "inner"
    )

    # Prefer the event time; use the ingest time only when it is missing.
    sequence_time = coalesce(
        col("c.EventTimestamp"), col("c.IngestTimestamp")
    ).cast("timestamp")
    with_sequence = enriched.withColumn("timestamp", sequence_time)

    output_columns = [
        col("c.ClaimID"),
        col("c.MemberID"),
        col("m.Name").alias("MemberName"),
        col("m.Region").alias("MemberRegion"),
        col("c.ProviderID"),
        col("p.Name").alias("ProviderName"),
        col("c.ClaimDate").cast("date"),
        col("c.Amount").cast("double"),
        col("c.Status"),
        col("c.ClaimType"),
        col("c.ICD10Codes"),
        col("timestamp"),
    ]
    return with_sequence.select(*output_columns)

# Declare the target streaming table that receives the upserted claims.
# `create_streaming_table` is the supported API; `create_streaming_live_table`
# is a deprecated alias for it.
dlt.create_streaming_table(
    name="silver.claims_base",
    comment="Base cleansed claims data before complex aggregations.",
)

# Upsert the prepared claims into the target as SCD type 1: one row per
# ClaimID, where the record with the greatest `timestamp` wins.
dlt.apply_changes(
    target="silver.claims_base",
    source="claims_prepared_base",
    keys=["ClaimID"],
    sequence_by=col("timestamp"),
    stored_as_scd_type=1,
)