In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

Check Aggregated data

In [0]:
%sql
-- Monthly claim counts and amounts per region and claim type, newest month first.
-- The numeric month `m` is selected alongside the month name so that the ORDER BY
-- sorts months chronologically rather than alphabetically.
SELECT
    year(month_window.start)                 AS Year
  , month(month_window.start)                AS m
  , date_format(month_window.start, 'MMMM')  AS Month
  , MemberRegion                             AS Region
  , coalesce(ClaimType, 'Unidentified')      AS ClaimType  -- label for rows whose ClaimType is NULL
  , sum(number_of_claims)                    AS NumberOfClaims
  , sum(total_amount)                        AS TotalAmount
FROM healthcare.gold.claims_monthly_summary
GROUP BY
    Year
  , m
  , Month
  , MemberRegion
  , ClaimType
ORDER BY
    Year DESC
  , m DESC
  , MemberRegion ASC
  , ClaimType ASC

Databricks visualization. Run in Databricks to view.

In [0]:
%sql
-- Inspect every row and column of the gold fraud-alerts table.
SELECT *
FROM healthcare.gold.fraud_alerts

Ingest & Create Views

In [0]:
# Expose the silver claims table to the SQL cells as the temp view `v_claims`.
claims_df = spark.read.table("healthcare.silver.claims")
claims_df.createOrReplaceTempView("v_claims")

In [0]:
%sql
-- Preview up to 10 claims from the North region through the temp view.
SELECT *
FROM v_claims
WHERE MemberRegion = 'North'
LIMIT 10;

Deduplicate & Validate Keys

In [0]:
%sql
-- Duplicate check: list every (ClaimID, MemberID, ProviderID) combination that
-- appears more than once in v_claims. Expected result: no rows.
SELECT
    ClaimID
  , MemberID
  , ProviderID
  , COUNT(*) AS occurrences
FROM v_claims
GROUP BY
    ClaimID
  , MemberID
  , ProviderID
HAVING COUNT(*) > 1

In [0]:
%sql
-- Foreign-key check: claims whose MemberID has no matching row in the bronze
-- members table. Expected result: no rows.
SELECT
    c.ClaimID
  , c.MemberID
FROM v_claims AS c
WHERE NOT EXISTS (
  SELECT 1
  FROM healthcare.bronze.members AS m
  WHERE m.MemberID = c.MemberID
)

Data Profiling & Parse Nested JSON

In [0]:
%sql
-- Show the schema (column names and data types) of the silver claims table.
DESCRIBE TABLE healthcare.silver.claims;

In [0]:
%sql
-- Profiling: count the claims that have no Amount value.
SELECT COUNT(*)
FROM v_claims
WHERE Amount IS NULL;

Pivot Data for Reporting

In [0]:
# Total claim Amount per MemberRegion, with one output column per ClaimType value.
pivot_df = (
    spark.table("v_claims")
    .groupBy("MemberRegion")
    .pivot("ClaimType")
    .sum("Amount")
    # Region / claim-type combinations with no claims come back as null;
    # replace those nulls in the numeric columns with 0.
    .fillna(0)
)

display(pivot_df)

UDFs

In [0]:
def get_amount_category(amount):
    """Bucket a claim amount into a coarse size category.

    Args:
        amount: Numeric claim amount, or None when the value is missing.

    Returns:
        "Unknown" for None or NaN, "Low" for amounts below 500,
        "Medium" for amounts from 500 up to (but not including) 3000,
        and "High" otherwise.
    """
    # NaN is the only value that is not equal to itself. Without this guard a
    # NaN amount fails both `<` comparisons below and is mislabelled "High".
    if amount is None or amount != amount:
        return "Unknown"
    if amount < 500:
        return "Low"
    if amount < 3000:
        return "Medium"
    return "High"

# Wrap the plain Python function as a Spark UDF that returns a string column.
amount_category_udf = udf(get_amount_category, returnType=StringType())

# Add the category next to every claim read from the v_claims temp view.
claims_with_category_df = (
    spark.table("v_claims")
    .withColumn("AmountCategory", amount_category_udf("Amount"))
)

display(claims_with_category_df.select("ClaimID", "Amount", "AmountCategory"))

UPSERT

In [0]:
%sql
-- Before the upsert: current rows for the two claim IDs used in the MERGE demo.
SELECT *
FROM healthcare.silver.claims_base
WHERE ClaimID IN ('CL000078', 'CL999999')

In [0]:
# Rows to upsert; each tuple follows the column order listed in `columns`.
update_data = [
    ("CL000078", 555.55, "Approved", "2025-09-08T18:00:00.000+0000"),
    ("CL999999", 123.45, "Submitted", "2025-09-08T18:00:00.000+0000"),
]
columns = ["ClaimID", "Amount", "Status", "timestamp"]

# Only column names are supplied, so Spark infers the column types from the Python values.
update_df = spark.createDataFrame(update_data, schema=columns)

# Make the staged rows available to the SQL MERGE cell.
update_df.createOrReplaceTempView("claim_updates_vw")

In [0]:
%sql
-- Upsert the staged rows from claim_updates_vw into the silver base table, keyed on ClaimID:
--   * a matching claim has its Amount, Status and timestamp overwritten;
--   * an unmatched claim is inserted with only these four columns supplied.
MERGE INTO healthcare.silver.claims_base AS tgt
USING claim_updates_vw AS src
  ON tgt.ClaimID = src.ClaimID
WHEN MATCHED THEN UPDATE SET
    tgt.Amount    = src.Amount
  , tgt.Status    = src.Status
  , tgt.timestamp = src.timestamp
WHEN NOT MATCHED THEN INSERT
  (ClaimID, Amount, Status, timestamp)
VALUES
  (src.ClaimID, src.Amount, src.Status, src.timestamp);

In [0]:
%sql
-- After the upsert: re-read the same two claim IDs to verify the update and the insert.
SELECT *
FROM healthcare.silver.claims_base
WHERE ClaimID IN ('CL000078', 'CL999999')

COPY INTO

In [0]:
%sql
-- Create the target table once; the statement is a no-op when the table already exists.
CREATE TABLE IF NOT EXISTS healthcare.bronze.members_by_copy (
    MemberID      STRING
  , `Name`        STRING
  , DOB           DATE
  , Gender        STRING
  , Region        STRING
  , PlanType      STRING
  , EffectiveDate DATE
  , Email         STRING
  , IsActive      BOOLEAN
  , LastUpdated   DATE
);

-- Load the members CSV (first line is a header) and cast each column to the table's type.
COPY INTO healthcare.bronze.members_by_copy
FROM (
  SELECT
      MemberID
    , Name
    , CAST(DOB AS DATE) AS DOB
    , Gender
    , Region
    , PlanType
    , CAST(EffectiveDate AS DATE) AS EffectiveDate
    , Email
      -- NOTE(review): the intermediate DOUBLE cast presumably means the CSV stores
      -- IsActive as a numeric flag (e.g. 1 / 0) -- confirm against the file.
    , CAST(CAST(IsActive AS DOUBLE) AS BOOLEAN) AS IsActive
    , CAST(LastUpdated AS DATE) AS LastUpdated
  FROM '/Volumes/healthcare/default/data/members.csv'
)
FILEFORMAT = CSV
FORMAT_OPTIONS ('header' = 'true');

In [0]:
%sql
-- Preview up to 10 of the loaded member rows.
SELECT *
FROM healthcare.bronze.members_by_copy
LIMIT 10

Delta Lake ACID Features: Time Travel, Rollback (RESTORE), OPTIMIZE, VACUUM

In [0]:
%sql
-- List the table's commit history (one row per version); use it to confirm
-- which version number the time-travel and RESTORE cells should reference.
DESCRIBE HISTORY healthcare.bronze.members_by_copy;

In [0]:
%sql
-- Delete every member in the Central region (sets up the time-travel / RESTORE demo).
DELETE FROM healthcare.bronze.members_by_copy
WHERE Region = 'Central';

In [0]:
%sql
-- Time travel: count Central-region members as the table stood at version 1.
-- NOTE(review): presumably version 1 is the state right after the COPY INTO load --
-- confirm the version number with DESCRIBE HISTORY.
SELECT COUNT(*)
FROM healthcare.bronze.members_by_copy VERSION AS OF 1
WHERE Region = 'Central';

In [0]:
%sql
-- Roll the table back to its state at version 1.
-- NOTE(review): the version number is hard-coded; presumably it is the commit just
-- before the DELETE -- confirm with DESCRIBE HISTORY before running.
RESTORE TABLE healthcare.bronze.members_by_copy TO VERSION AS OF 1;

In [0]:
%sql
-- Compact the table's data files and co-locate rows by Region (Z-ordering).
OPTIMIZE healthcare.bronze.members_by_copy ZORDER BY (Region);

In [0]:
%sql
-- Remove data files that the table no longer references and that are older than
-- the retention threshold (no RETAIN clause is given, so the default applies).
-- Versions whose files are removed can no longer be time-travelled to or restored.
VACUUM healthcare.bronze.members_by_copy;