# In [0]:
# ===============================
# Healthcare Analytics Pipeline - DLT with Quality Checks
# ===============================

import dlt
from pyspark.sql.functions import col

# ===============================
# 1️⃣ Bronze Layer Quality Checks
# ===============================

@dlt.table(
    comment="Quality check for Bronze Patient Table",
    table_properties={
        "pipelines.autoOptimize.managed": "true"
    }
)
# Expectations have to be attached as decorators. Calling dlt.expect(...)
# inside the function body only builds a decorator object and throws it away,
# so no constraint is ever registered on the dataset.
@dlt.expect_all({
    # Key demographic columns must be populated.
    "patient_id_not_null": "PatientID IS NOT NULL",
    "state_not_null": "State IS NOT NULL",
    "sex_not_null": "Sex IS NOT NULL",
    # NOTE(review): this assumes AgeCategory is numeric. If the column holds
    # text labels the comparison will not behave as intended - TODO confirm
    # against the bronze schema.
    "valid_age_category": "AgeCategory > 0",
})
def bronze_patient_qc():
    """Expose the bronze patient table with data-quality expectations attached.

    The expectations use the "warn" policy (``expect_all``): rows that violate
    a constraint are kept in the output and the violation is recorded in the
    pipeline's data-quality metrics.

    Returns:
        The DataFrame read from ``processed_outputs.bronze.patient``,
        unmodified.
    """
    return dlt.read("processed_outputs.bronze.patient")

@dlt.table(
    comment="Quality check for Bronze Hospital Table"
)
# Expectations are decorators; invoking dlt.expect(...) inside the function
# body would create and discard them without registering any constraint.
@dlt.expect_all({
    "hospital_id_not_null": "Hospital_ID IS NOT NULL",
    "hospital_name_not_null": "Hospital_Name IS NOT NULL",
})
def bronze_hospital_qc():
    """Expose the bronze hospital table with identifier/name null checks.

    Both expectations use the "warn" policy: violating rows are retained and
    the violations are reported in the pipeline's data-quality metrics.

    Returns:
        The DataFrame read from ``processed_outputs.bronze.hospital``,
        unmodified.
    """
    return dlt.read("processed_outputs.bronze.hospital")

# ===============================
# 2️⃣ Silver Layer Quality Checks
# ===============================

# Columns checked by silver_qc. The expectation dict is built at import time
# because decorators are evaluated when the function is defined, not when it
# runs.
_SILVER_NOT_NULL_COLS = ("State", "Sex", "AgeCategory", "GeneralHealth")
_SILVER_POSITIVE_COLS = ("HeightInMeters", "WeightInKilograms", "BMI")
_SILVER_ROW_EXPECTATIONS = {
    **{
        f"{name.lower()}_not_null": f"{name} IS NOT NULL"
        for name in _SILVER_NOT_NULL_COLS
    },
    **{
        f"{name.lower()}_positive": f"{name} > 0"
        for name in _SILVER_POSITIVE_COLS
    },
}


@dlt.table(
    comment="Quality check for Silver Layer Processed Data"
)
# Expectations must be applied as decorators; calling dlt.expect(...) inside
# the body builds a decorator and discards it, registering nothing.
@dlt.expect_all(_SILVER_ROW_EXPECTATIONS)
def silver_qc():
    """Expose the silver processed table with row-level quality checks.

    Checks (all "warn" policy - violating rows are kept and reported):
      * critical columns are not null;
      * physical measurements are strictly positive.

    The duplicate-PatientID check lives in ``silver_patient_uniqueness_qc``
    because expectations are evaluated per row and cannot contain aggregates
    such as ``count(*)``.

    Returns:
        The DataFrame read from ``processed_outputs.silver.processed``,
        unmodified.
    """
    return dlt.read("processed_outputs.silver.processed")


@dlt.table(
    comment="Duplicate PatientID check for Silver Layer Processed Data"
)
@dlt.expect("no_duplicate_patient", "num_entries = 1")
def silver_patient_uniqueness_qc():
    """Count rows per PatientID so duplicates can be checked row by row.

    An aggregate constraint cannot be evaluated as a per-row expectation, so
    the data is first grouped by ``PatientID``; each output row then carries
    the number of source rows for that id in ``num_entries``, and the
    expectation flags any id that occurs more than once.

    Returns:
        A DataFrame with one row per ``PatientID`` and a ``num_entries``
        column holding its occurrence count.
    """
    return (
        dlt.read("processed_outputs.silver.processed")
        .groupBy("PatientID")
        .count()
        .withColumnRenamed("count", "num_entries")
    )

# ===============================
# 3️⃣ Gold Layer Quality Checks
# ===============================

@dlt.table(
    comment="Quality check for Gold Layer Health Metrics"
)
# Expectations are decorators; calling dlt.expect(...) in the function body
# would build and discard them without registering any constraint.
@dlt.expect_all({
    "avg_riskscore_not_null": "AvgRiskScore IS NOT NULL",
    "patient_count_positive": "PatientCount > 0",
})
def gold_qc():
    """Expose the curated health-metrics table with quality checks attached.

    Both expectations use the "warn" policy: violating rows are retained and
    the violations are reported in the pipeline's data-quality metrics.

    Returns:
        The DataFrame read from ``processed_outputs.curated.health_metrics``,
        unmodified.
    """
    return dlt.read("processed_outputs.curated.health_metrics")

@dlt.table(
    comment="Quality check for Gold Layer Hospital Rankings"
)
# Applied as a decorator: calling dlt.expect(...) inside the function body
# builds a decorator and discards it, so nothing would be registered.
# The constraint only verifies that Rank is populated, so the expectation is
# named for what it checks. It does NOT verify that ranks are unique.
@dlt.expect("hospital_rank_not_null", "Rank IS NOT NULL")
def hospital_ranking_qc():
    """Expose the curated hospital-rankings table with a Rank null check.

    The expectation uses the "warn" policy: rows with a null ``Rank`` are
    retained and reported in the pipeline's data-quality metrics.

    Returns:
        The DataFrame read from
        ``processed_outputs.curated.hospital_rankings``, unmodified.
    """
    return dlt.read("processed_outputs.curated.hospital_rankings")

@dlt.table(
    comment="Quality check for Gold Layer Risk by Hospital"
)
# Expectations are decorators; calling dlt.expect(...) in the function body
# would build and discard them without registering any constraint.
@dlt.expect_all({
    "avg_riskscore_not_null": "AvgRiskScore IS NOT NULL",
    "patient_count_positive": "PatientCount > 0",
})
def risk_by_hospital_qc():
    """Expose the curated risk-by-hospital table with quality checks attached.

    Both expectations use the "warn" policy: violating rows are retained and
    the violations are reported in the pipeline's data-quality metrics.

    Returns:
        The DataFrame read from
        ``processed_outputs.curated.risk_by_hospital``, unmodified.
    """
    return dlt.read("processed_outputs.curated.risk_by_hospital")

@dlt.table(
    comment="Quality check for Gold Layer Risk by Smoker Status"
)
# Expectations are decorators; calling dlt.expect(...) in the function body
# would build and discard them without registering any constraint.
@dlt.expect_all({
    "avg_riskscore_not_null": "AvgRiskScore IS NOT NULL",
    "patient_count_positive": "PatientCount > 0",
})
def risk_by_smoker_qc():
    """Expose the curated risk-by-smoker table with quality checks attached.

    Both expectations use the "warn" policy: violating rows are retained and
    the violations are reported in the pipeline's data-quality metrics.

    Returns:
        The DataFrame read from
        ``processed_outputs.curated.risk_by_smoker``, unmodified.
    """
    return dlt.read("processed_outputs.curated.risk_by_smoker")

@dlt.table(
    comment="Quality check for Gold Layer Risk by Race"
)
# Expectations are decorators; calling dlt.expect(...) in the function body
# would build and discard them without registering any constraint.
@dlt.expect_all({
    "avg_riskscore_not_null": "AvgRiskScore IS NOT NULL",
    "patient_count_positive": "PatientCount > 0",
})
def risk_by_race_qc():
    """Expose the curated risk-by-race table with quality checks attached.

    Both expectations use the "warn" policy: violating rows are retained and
    the violations are reported in the pipeline's data-quality metrics.

    Returns:
        The DataFrame read from
        ``processed_outputs.curated.risk_by_race``, unmodified.
    """
    return dlt.read("processed_outputs.curated.risk_by_race")
