### Loading the raw medical dataset into the volume

### Bronze Layer 

In [0]:
import dlt
from pyspark.sql.functions import *

@dlt.table(name="bronze_hospital_dataset", comment="Raw hospital data")
def load_patients():
    """Read the hospital claim CSV from the raw volume and stamp each row.

    Returns:
        A DataFrame holding the CSV contents (first line treated as the
        header) plus an ``ingest_timestamp`` column filled by
        ``current_timestamp()``.
    """
    source_path = "/Volumes/dev_catalog/default/raw_volume/hospital_datasets/hospital1_claim_data.csv"
    csv_reader = spark.read.option("header", True)
    raw_claims = csv_reader.csv(source_path)
    return raw_claims.withColumn("ingest_timestamp", current_timestamp())



In [0]:
@dlt.table(name="bronze_transactions", comment="Raw transactions data")
def load_encounters():
    """Read the transactions CSV from the raw volume without modification.

    Returns:
        A DataFrame holding the CSV contents, with the first line treated as
        the header.
    """
    # NOTE(review): adding an ingest_timestamp column here, as the hospital
    # bronze table does, would leave the silver join (keyed on EncounterID only)
    # with two columns of that name. Rename one side first if ingestion time is
    # needed for transactions.
    return spark.read.option("header", True).csv(
        "/Volumes/dev_catalog/default/raw_volume/hospital_datasets/transactions.csv"
    )

### Silver Layer

In [0]:
import dlt
from pyspark.sql.functions import col, to_date, year, month

@dlt.table(name="silver_claims_transactions")
def transform_claims_transactions():
    """Clean both bronze tables and join them into the silver table.

    Claims and transactions are each filtered for missing key columns, type
    cast and de-duplicated, then inner-joined on ``EncounterID``. Three derived
    columns are added to the joined rows: ``NetPayment``, ``ClaimYear`` and
    ``ClaimMonth``.

    Returns:
        The joined DataFrame including the derived columns.
    """
    raw_claims = dlt.read("bronze_hospital_dataset")
    raw_txns = dlt.read("bronze_transactions")

    # Claims: drop rows missing either identifier, cast the amount columns,
    # then keep one row per ClaimID.
    claims = raw_claims.dropna(subset=["ClaimID", "EncounterID"])
    for amount_col in ("Deductible", "Coinsurance", "Copay", "PaidAmount"):
        claims = claims.withColumn(amount_col, col(amount_col).cast("double"))
    claims = claims.withColumn("ClaimStatus", col("ClaimStatus").cast("string"))
    claims = claims.dropDuplicates(["ClaimID"])

    # Transactions: rows missing an id, encounter or amount are dropped before
    # the casts; one row is kept per TransactionID.
    txns = raw_txns.dropna(subset=["TransactionID", "EncounterID", "Amount"])
    txns = txns.withColumn("Amount", col("Amount").cast("double"))
    txns = txns.withColumn(
        "TransactionDate", to_date(col("TransactionDate"), "yyyy-MM-dd")
    )
    txns = txns.dropDuplicates(["TransactionID"])

    # Inner join on EncounterID: a claim row appears once per matching
    # transaction row.
    joined = claims.join(txns, on="EncounterID", how="inner")

    # NetPayment is null whenever any of its four inputs is null.
    deductions = col("Deductible") + col("Coinsurance") + col("Copay")
    joined = joined.withColumn("NetPayment", col("PaidAmount") - deductions)

    # ClaimYear / ClaimMonth are taken from TransactionDate.
    joined = joined.withColumn("ClaimYear", year("TransactionDate"))
    return joined.withColumn("ClaimMonth", month("TransactionDate"))




### Gold Layer

In [0]:
@dlt.table(name="gold_claim_summary")
def claim_summary():
    """Aggregate the silver table by claim status, year and month.

    Returns:
        A DataFrame with one row per (``ClaimStatus``, ``ClaimYear``,
        ``ClaimMonth``) group, carrying ``TotalPaidAmount`` (sum of
        ``PaidAmount``), ``TotalNetPayment`` (sum of ``NetPayment``) and
        ``Unique_Claims`` (distinct ``ClaimID`` count).
    """
    # Namespaced local import: a bare `sum` is only Spark's aggregate while a
    # wildcard import of pyspark.sql.functions is in effect; otherwise it is
    # Python's builtin, which raises TypeError on a column name.
    from pyspark.sql import functions as F

    silver = dlt.read("silver_claims_transactions")
    grouped = silver.groupBy("ClaimStatus", "ClaimYear", "ClaimMonth")
    return grouped.agg(
        F.sum("PaidAmount").alias("TotalPaidAmount"),
        F.sum("NetPayment").alias("TotalNetPayment"),
        F.countDistinct("ClaimID").alias("Unique_Claims"),
    )


In [0]:
# Prints a fixed string; this cell does not read or write any pipeline table.
print("hello world")

In [0]:
%sql
-- Queries the silver table under the name given to @dlt.table in this notebook;
-- the previous "newsilver_" prefix matched no table defined here.
select * from dev_catalog.default.silver_claims_transactions


In [0]:
%sql
-- Return every row and column of the gold summary table.
SELECT *
FROM dev_catalog.default.gold_claim_summary