In [0]:
import dlt
from pyspark.sql.functions import *

# Bronze layer: raw member records loaded as-is from the members CSV file.
@dlt.table(
    comment="Bronze Load for MemberData CSV",
    table_properties={"quality": "bronze", "pipelines.autoOptimize.managed": "true"},
)
def BronzeMembers():
    """Return the members CSV as a DataFrame.

    The file is read with `spark.read` (batch reader); the first row is
    treated as the header and column types are inferred from the data.
    """
    csv_reader = (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
    )
    return csv_reader.load("/Volumes/capstone_hospital/data/raw/members.csv")


# Bronze layer: raw diagnosis reference rows loaded as-is from the CSV file.
@dlt.table(
    comment="Bronze Load for Diagnosis CSV",
    table_properties={"quality": "bronze", "pipelines.autoOptimize.managed": "true"},
)
def BronzeDiagnosis():
    """Return the diagnosis reference CSV as a DataFrame.

    The file is read with `spark.read` (batch reader); the first row is
    treated as the header and column types are inferred from the data.
    """
    csv_reader = (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
    )
    return csv_reader.load("/Volumes/capstone_hospital/data/raw/diagnosis_ref.csv")


# Bronze layer: raw claim batch rows loaded as-is from the CSV file.
@dlt.table(
    comment="Bronze Load for Claim CSV",
    table_properties={"quality": "bronze", "pipelines.autoOptimize.managed": "true"},
)
def BronzeClaim():
    """Return the claims batch CSV as a DataFrame.

    The file is read with `spark.read` (batch reader); the first row is
    treated as the header and column types are inferred from the data.
    """
    csv_reader = (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
    )
    return csv_reader.load("/Volumes/capstone_hospital/data/raw/claims_batch.csv")



[0;31m---------------------------------------------------------------------------[0m
[0;31mModuleNotFoundError[0m                       Traceback (most recent call last)
File [0;32m<command-4760633582222195>, line 1[0m
[0;32m----> 1[0m [38;5;28;01mimport[39;00m [38;5;21;01mdlt[39;00m
[1;32m      2[0m [38;5;28;01mfrom[39;00m [38;5;21;01mpyspark[39;00m[38;5;21;01m.[39;00m[38;5;21;01msql[39;00m[38;5;21;01m.[39;00m[38;5;21;01mfunctions[39;00m [38;5;28;01mimport[39;00m col
[1;32m      4[0m [38;5;129m@dlt[39m[38;5;241m.[39mtable(
[1;32m      5[0m   name[38;5;241m=[39m[38;5;124m"[39m[38;5;124mcapstone_hospital.bronze_members[39m[38;5;124m"[39m,
[1;32m      6[0m   comment[38;5;241m=[39m[38;5;124m"[39m[38;5;124mRaw members data ingested to Bronze layer[39m[38;5;124m"[39m,
[0;32m   (...)[0m
[1;32m     10[0m )
[1;32m     11[0m [38;5;28;01mdef[39;00m [38;5;21mbronze_members[39m():

File [0;32m/databricks/python_shell/lib/dbruntime/au

In [0]:
# Bronze layer: raw claim events loaded as-is from the claims_stream JSON file.
@dlt.table(
    comment="Bronze Load for Claims Stream JSON",
    table_properties={"quality": "bronze", "pipelines.autoOptimize.managed": "true"},
)
def BronzeClaimsStream():
    """Return the claims_stream JSON file as a DataFrame.

    Despite the "stream" in the name, the file is loaded with `spark.read`
    (the batch reader), not `spark.readStream`.
    """
    source_path = "/Volumes/capstone_hospital/data/raw/claims_stream.json"
    return spark.read.format("json").load(source_path)

# Bronze layer: raw provider records loaded as-is from the providers JSON file.
@dlt.table(
    comment="Bronze Load for Providers JSON",
    table_properties={"quality": "bronze", "pipelines.autoOptimize.managed": "true"},
)
def BronzeProviders():
    """Return the providers JSON file as a DataFrame (batch read via `spark.read`)."""
    source_path = "/Volumes/capstone_hospital/data/raw/providers.json"
    return spark.read.format("json").load(source_path)



In [0]:
# Silver layer: members with a non-null memberID, one row per memberID.
@dlt.table(
    name="SilverMembers",
    # The previous comment text described "sales data", which this table does
    # not contain; it now describes the member data actually produced here.
    comment="Cleaned and deduplicated member data for the Silver layer",
    table_properties={
        "quality": "silver",
        "pipelines.autoOptimize.managed": "true"
    }
)
def SilverMembers():
    """Build the Silver members table from BronzeMembers.

    Returns:
        DataFrame: rows of BronzeMembers whose `memberID` is not null,
        reduced to a single row per `memberID`.
    """
    bronze_members = dlt.read("BronzeMembers")
    # Drop rows without a key first so the de-duplication only sees valid IDs.
    keyed_members = bronze_members.filter(col("memberID").isNotNull())
    return keyed_members.dropDuplicates(["memberID"])

# Silver layer: diagnosis reference rows with a non-null code, one row per code.
@dlt.table(
    name="SilverDiagnosis",
    comment="Cleaned data including quality checks",
    table_properties={"quality": "silver", "pipelines.autoOptimize.managed": "true"},
)
def SilverDiagnosis():
    """Return BronzeDiagnosis rows with a non-null `code`, de-duplicated on `code`."""
    bronze_diagnosis = dlt.read("BronzeDiagnosis")
    keyed_diagnosis = bronze_diagnosis.filter(col("code").isNotNull())
    return keyed_diagnosis.dropDuplicates(["code"])

# Silver layer: batch claims with a non-null ClaimID, one row per ClaimID.
@dlt.table(
    name="SilverClaim",
    comment="Cleaned data including quality checks",
    table_properties={"quality": "silver", "pipelines.autoOptimize.managed": "true"},
)
def SilverClaim():
    """Return BronzeClaim rows with a non-null `ClaimID`, de-duplicated on `ClaimID`."""
    bronze_claims = dlt.read("BronzeClaim")
    keyed_claims = bronze_claims.filter(col("ClaimID").isNotNull())
    return keyed_claims.dropDuplicates(["ClaimID"])

 # SilverView: claims stream : cleaned data including quality checks
@dlt.table(
    # NOTE(review): "silveClaimsStream" looks like a typo for
    # "SilverClaimsStream". The published table name is kept unchanged here
    # because consumers outside this notebook may already reference it.
    name="silveClaimsStream",
    comment="Cleaned data including quality checks",
    table_properties={
        "quality": "silver",
        "pipelines.autoOptimize.managed": "true"
    }
)
def SilverClaimsStream():
    """Build the Silver claims-stream table from BronzeClaimsStream.

    The function was previously also called `SilverClaim`, re-defining the
    function of the batch-claims table; it now has its own name. The table
    name itself comes from the decorator's `name` argument and is unchanged.

    Returns:
        DataFrame: rows of BronzeClaimsStream whose `ClaimID` is not null,
        reduced to a single row per `ClaimID`.
    """
    bronze_stream_claims = dlt.read("BronzeClaimsStream")
    keyed_claims = bronze_stream_claims.filter(col("ClaimID").isNotNull())
    return keyed_claims.dropDuplicates(["ClaimID"])

 # SilverView: provider : cleaned data including quality checks
@dlt.table(
    name="SilverProviders",
    comment="Cleaned data including quality checks",
    table_properties={
        "quality": "silver",
        "pipelines.autoOptimize.managed": "true"
    }
)
def SilverProviders():
    """Build the Silver providers table from BronzeProviders.

    The function was previously called `SilverClaim`, re-defining the
    function of the batch-claims table; it is now named after the table it
    produces. The table name comes from the decorator's `name` argument and
    is unchanged.

    Returns:
        DataFrame: rows of BronzeProviders whose `ProviderID` is not null,
        reduced to a single row per `ProviderID`.
    """
    bronze_providers = dlt.read("BronzeProviders")
    keyed_providers = bronze_providers.filter(col("ProviderID").isNotNull())
    return keyed_providers.dropDuplicates(["ProviderID"])

In [0]:
@dlt.table(
    name="GoldFraudDetection",
    comment="Joined with Filtered data for Fraud Detection"
)
def GoldFraudDetection():
    """Build the Gold fraud-detection table from the Silver tables.

    Steps:
      1. Read the Silver claim, diagnosis and member tables.
      2. Split the comma-separated `ICD10Codes` column and emit one row per
         code in a new `ICD10Code` column.
      3. Keep only rows where `datediff(ClaimDate, ServiceDate) > 1`.
      4. Inner-join members on `MemberID` and diagnoses on
         `ICD10Code == code`.

    Returns:
        DataFrame: the filtered, exploded claims enriched with member and
        diagnosis columns.
    """
    # Local names are deliberately distinct from the module-level table
    # functions (SilverClaim, SilverDiagnosis, SilverMembers) to avoid
    # shadowing them inside this function.
    claims = spark.read.table("capstone_hospital.default.SilverClaim")
    diagnosis = spark.read.table("capstone_hospital.default.SilverDiagnosis")
    members = spark.read.table("capstone_hospital.default.SilverMembers")

    # One output row per ICD-10 code listed on the claim.
    # NOTE(review): codes are split on "," only; if the source uses ", " the
    # pieces keep a leading space and will not match `code` — confirm format.
    exploded_claims = claims.withColumn(
        "ICD10Code", explode(split(col("ICD10Codes"), ","))
    )

    # datediff(end, start) counts days from start to end, so this keeps rows
    # whose ClaimDate is more than one day AFTER ServiceDate.
    # NOTE(review): the earlier comment said "more than 24 hours before
    # ServiceDate"; if that is the real rule the arguments must be swapped.
    filtered_claims = exploded_claims.filter(
        datediff(col("ClaimDate"), col("ServiceDate")) > 1
    )

    # The diagnosis join previously referenced an undefined name
    # (`silver_diagnosis`), which raised NameError; it now uses the DataFrame
    # loaded above.
    claims_with_members = filtered_claims.join(members, "MemberID")
    return claims_with_members.join(
        diagnosis, filtered_claims["ICD10Code"] == diagnosis["code"]
    )