In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, current_date, expr
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, BooleanType, DoubleType
from datetime import date

# Obtain the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("SCD2_Example")
    .getOrCreate()
)

# --- Schemas ---
# Provider dimension columns. The seed rows below leave end_date as None and
# set is_current to True.
provider_schema = (
    StructType()
    .add("provider_key", IntegerType(), False)
    .add("provider_id", StringType(), False)
    .add("provider_name", StringType(), True)
    .add("specialty", StringType(), True)
    .add("state", StringType(), True)
    .add("start_date", DateType(), False)
    .add("end_date", DateType(), True)
    .add("is_current", BooleanType(), False)
)

# Claims fact columns; provider_key is nullable here.
claims_schema = (
    StructType()
    .add("claim_id", StringType(), False)
    .add("member_id", StringType(), False)
    .add("claim_date", DateType(), False)
    .add("claim_amount", DoubleType(), True)
    .add("provider_key", IntegerType(), True)
)

# --- Seed data ---
# Tuple order follows provider_schema:
# (provider_key, provider_id, provider_name, specialty, state,
#  start_date, end_date, is_current)
initial_provider_data = [
    (1, 'P101', 'Dr. Smith', 'Cardiology', 'NY', date(2020, 1, 1), None, True),
    (2, 'P102', 'Dr. Jones', 'Pediatrics', 'CA', date(2019, 5, 15), None, True),
]

# Materialise the provider dimension and expose it to SQL as a temp view.
dim_provider = spark.createDataFrame(initial_provider_data, schema=provider_schema)
dim_provider.createOrReplaceTempView("dim_provider_view")

# The claims fact starts out empty; only its schema is registered for now.
fct_claims = spark.createDataFrame([], schema=claims_schema)
fct_claims.createOrReplaceTempView("fct_claims_view")

print("Initial Provider Dimension:")
display(dim_provider)

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, BooleanType
from datetime import datetime

# Schema for the incoming provider change rows: same column names and types
# as the provider dimension, but every field is declared nullable.
schema = (
    StructType()
    .add("provider_key", IntegerType(), True)
    .add("provider_id", StringType(), True)
    .add("provider_name", StringType(), True)
    .add("specialty", StringType(), True)
    .add("state", StringType(), True)
    .add("start_date", DateType(), True)
    .add("end_date", DateType(), True)
    .add("is_current", BooleanType(), True)
)

# Change record for provider P101 (Dr. Smith): state is 'CA' in this row,
# with a new provider_key and a start date of 2024-06-15.
smith_change_start = datetime.strptime('2024-06-15', '%Y-%m-%d').date()
update_data = [
    (3, 'P101', 'Dr. Smith', 'Cardiology', 'CA', smith_change_start, None, True),
]

# Build the DataFrame and expose it to SQL as "provider_updates".
updates_df = spark.createDataFrame(update_data, schema)
updates_df.createOrReplaceTempView("provider_updates")

In [0]:
def implement_provider_scd1(target_table, source_df, key_column="provider_id"):
    """
    Upsert provider rows into ``target_table`` using SCD Type 1 (overwrite, no history).

    Source rows whose ``key_column`` matches an existing provider replace that
    provider's stored values; source rows with an unseen key are inserted.

    Columns overwritten on a match (Delta path): practice_location, name,
    Provider_Plan_ID, specialty, is_pcp, ID and TaxID. The fallback path
    replaces the whole matching row with the source row. Neither path keeps
    history for specialty or PCP status; use an SCD Type 2 process if that
    history is required.

    Execution paths:
    - Delta Lake MERGE directly into ``target_table``.
    - If anything in the Delta path raises, the combined result is written to
      ``{target_table}_scd1_updated`` in overwrite mode instead; this path
      does not write to ``target_table``.

    Args:
        target_table: Name of the provider table, resolvable by ``spark.table``.
        source_df: DataFrame of changed/new provider rows. Must contain
            ``key_column``; for the fallback path its columns must match the
            target table's columns by name.
        key_column: Column that identifies a provider in both inputs.

    Returns:
        DataFrame over the table holding the result: ``target_table`` after a
        Delta merge, otherwise ``{target_table}_scd1_updated``.
    """

    print("=== PROVIDER SCD TYPE 1 IMPLEMENTATION ===")

    # Read current provider table
    current_providers = spark.table(target_table)
    print("Current provider table:")
    current_providers.show()

    print("Provider location changes to apply:")
    source_df.show()

    # Identify what columns to update (SCD Type 1 columns)
    scd1_columns = ["practice_location", "name"]  # Location and name corrections

    # Collect the affected keys once (key column only) and reuse the same
    # predicate for both the before and after views.
    affected_keys = [
        row[key_column]
        for row in source_df.select(key_column).distinct().collect()
    ]
    is_affected = col(key_column).isin(affected_keys)

    # Show before state
    print("=== BEFORE CHANGES ===")
    current_providers.filter(is_affected).show()

    # Count brand-new keys BEFORE any write. DataFrames are lazy, so running
    # this anti-join after the merge could read the already-merged table and
    # report every source row as pre-existing.
    new_providers = source_df.join(
        current_providers.select(key_column), key_column, "left_anti"
    ).count()

    # For SCD Type 1, we simply merge/upsert the changes.
    # Records whose key does not appear in the source stay as they are.
    unchanged_providers = current_providers.join(
        source_df.select(key_column),
        key_column,
        "left_anti"
    )

    # Combine unchanged records with all source records (updated + new).
    # unionByName aligns columns by name; a positional union would silently
    # mis-map values if the table and the source order their columns differently.
    updated_providers = unchanged_providers.unionByName(source_df)

    # Preferred approach: Delta merge (if available)
    try:
        from delta.tables import DeltaTable

        delta_table = DeltaTable.forName(spark, target_table)

        # Build update dictionary: target column -> source expression
        update_dict = {name: f"source.{name}" for name in scd1_columns}
        update_dict["Provider_Plan_ID"] = "source.Provider_Plan_ID"
        update_dict["specialty"] = "source.specialty"
        update_dict["is_pcp"] = "source.is_pcp"
        update_dict["ID"] = "source.ID"
        update_dict["TaxID"] = "source.TaxID"

        delta_table.alias("target") \
            .merge(
                source_df.alias("source"),
                f"target.{key_column} = source.{key_column}"
            ) \
            .whenMatchedUpdate(set=update_dict) \
            .whenNotMatchedInsertAll() \
            .execute()

        print("Used Delta Lake merge for SCD Type 1")
        result_df = spark.table(target_table)

    except Exception as e:
        # Deliberate best-effort: any failure in the Delta path (package
        # missing, table not Delta, merge error) falls back to a plain
        # DataFrame write into a separate table.
        print(f"Delta merge not available, using DataFrame operations: {e}")

        # Save using DataFrame operations
        updated_providers.write.mode("overwrite").saveAsTable(f"{target_table}_scd1_updated")
        result_df = spark.table(f"{target_table}_scd1_updated")

    # Show after state
    print("=== AFTER CHANGES ===")
    result_df.filter(is_affected).show()

    # Show change summary
    print("=== CHANGE SUMMARY ===")
    total_providers = result_df.count()
    updated_count = source_df.count()

    print(f"Total providers after update: {total_providers}")
    print(f"Records processed: {updated_count}")
    print(f"New providers added: {new_providers}")
    print(f"Existing providers updated: {updated_count - new_providers}")

    return result_df

# Schema of the provider change feed.
# NOTE: this rebinds `provider_schema`, which the first cell used for the
# provider dimension; cells run after this one see this definition.
provider_schema = (
    StructType()
    .add("Provider_Plan_ID", IntegerType(), True)
    .add("name", StringType(), True)
    .add("specialty", StringType(), True)
    .add("practice_location", StringType(), True)
    .add("is_pcp", IntegerType(), True)
    .add("provider_id", StringType(), True)
    .add("ID", IntegerType(), True)
    .add("TaxID", IntegerType(), True)
)

# Sample provider change rows. Tuple order follows provider_schema:
# (Provider_Plan_ID, name, specialty, practice_location, is_pcp,
#  provider_id, ID, TaxID)
provider_changes = [
    # Dr. James Wilson: moved to a new location
    (2001, "Dr. James Wilson", "Family Medicine", "Downtown Medical Center", 1, "PR0001", 1, 957974335),
    # Dr. Sarah Johnson: name correction plus location change
    (2002, "Dr. Sarah M. Johnson", "Pediatrics", "Children's Hospital East Wing", 1, "PR0002", 2, 123456789),
    # Dr. Michael Chen: existing provider, location change
    (2003, "Dr. Michael Chen", "Cardiology", "Heart & Vascular Institute", 0, "PR0003", 3, 987654321),
    # Dr. Lisa Rodriguez: new provider joining the network
    (2004, "Dr. Lisa Rodriguez", "Orthopedics", "Sports Medicine Center", 0, "PR0004", 4, 555666777),
    # Dr. Robert Kim: location consolidation
    (2005, "Dr. Robert Kim", "Internal Medicine", "North Clinic", 1, "PR0005", 5, 111222333),
]

provider_updates = spark.createDataFrame(provider_changes, provider_schema)

print("=== PROVIDER LOCATION CHANGES TO PROCESS ===")
provider_updates.show(truncate=False)



In [0]:
# Analyze how providers are currently distributed across locations and specialties
def analyze_provider_changes(table_name="default.providers"):
    """
    Summarise the provider table by practice location and by specialty.

    Both summaries describe the table's current contents; no change history
    is consulted. Each summary is printed with ``show`` and also returned.

    Args:
        table_name: Provider table to analyse. Defaults to
            ``default.providers``; pass e.g. the ``..._scd1_updated`` table
            written by the SCD1 fallback path to analyse that result instead.
            The name is interpolated into the SQL text, so it must be a
            trusted identifier, never user input.

    Returns:
        Tuple ``(location_analysis, specialty_analysis)`` of DataFrames:
        - location_analysis: practice_location, provider_count, pcp_count,
          specialties (list of specialty values at that location).
        - specialty_analysis: specialty, provider_count, pcp_count,
          locations (distinct practice locations for that specialty).
    """

    print("=== PROVIDER CHANGE ANALYTICS ===")

    # Group providers by location to see consolidation patterns.
    # COUNT(CASE WHEN ...) counts only rows where is_pcp = 1 because the CASE
    # yields NULL otherwise and COUNT skips NULLs.
    location_analysis = spark.sql(f"""
        SELECT 
            practice_location,
            COUNT(*) as provider_count,
            COUNT(CASE WHEN is_pcp = 1 THEN 1 END) as pcp_count,
            COLLECT_LIST(specialty) as specialties
        FROM {table_name}
        GROUP BY practice_location
        ORDER BY provider_count DESC
    """)

    print("Provider distribution by location:")
    location_analysis.show(truncate=False)

    # Specialty distribution
    specialty_analysis = spark.sql(f"""
        SELECT 
            specialty,
            COUNT(*) as provider_count,
            COUNT(CASE WHEN is_pcp = 1 THEN 1 END) as pcp_count,
            COUNT(DISTINCT practice_location) as locations
        FROM {table_name}
        GROUP BY specialty
        ORDER BY provider_count DESC
    """)

    print("Provider distribution by specialty:")
    specialty_analysis.show()

    return location_analysis, specialty_analysis

# Run provider analytics against the default provider table
location_stats, specialty_stats = analyze_provider_changes()

# Validate provider data quality after SCD1 updates
def validate_provider_data_quality(table_name="default.providers"):
    """
    Print basic data-quality checks for a provider table.

    Checks performed:
    - number of provider_id values that occur more than once;
    - number of rows with a NULL name, specialty, practice_location or
      provider_id (those rows are also shown when any exist);
    - row counts per is_pcp value.

    Args:
        table_name: Provider table to validate. Defaults to
            ``default.providers``.

    Returns:
        None. Results are printed only.
    """
    print("=== PROVIDER DATA QUALITY VALIDATION ===")

    providers_df = spark.table(table_name)

    # Check for duplicates: any provider_id appearing on more than one row
    duplicate_providers = providers_df.groupBy("provider_id").count().filter(col("count") > 1)
    print(f"Duplicate provider IDs: {duplicate_providers.count()}")

    # Check for missing required fields
    missing_data = providers_df.filter(
        col("name").isNull() |
        col("specialty").isNull() |
        col("practice_location").isNull() |
        col("provider_id").isNull()
    )
    # Count once: each count() launches a Spark job, and reusing the value
    # keeps the printed number consistent with the branch below.
    missing_count = missing_data.count()
    print(f"Records with missing required data: {missing_count}")

    # Check PCP distribution
    pcp_distribution = providers_df.groupBy("is_pcp").count()
    print("PCP distribution:")
    pcp_distribution.show()

    if missing_count > 0:
        print("Records with missing data:")
        missing_data.show()

# Run validation against the default provider table
validate_provider_data_quality()

In [0]:
from pyspark.sql.functions import count, sum, avg, countDistinct

def analyze_claims_with_scd_dimensions():
    """
    Join 2024+ claims to eligibility and provider data and summarise them.

    Claims from ``default.claims`` with date_of_service on or after
    2024-01-01 are LEFT JOINed to:
    - ``healthanalytics.eligibility``: the eligibility row whose date range
      contains the claim's date_of_service (point-in-time lookup). A NULL
      eligibility_end_date is treated as open-ended.
    - ``default.providers``: matched on provider id; whatever values the
      provider table currently holds are used.

    Two aggregates are displayed: claims by eligibility status and type of
    service, and claims by practice location and provider specialty.

    NOTE(review): if a member has overlapping eligibility ranges, the LEFT
    JOIN yields one output row per matching range and the aggregates will
    count that claim more than once -- confirm ranges never overlap.

    Returns:
        DataFrame of claims enriched with eligibility and provider columns.
    """

    print("=== CLAIMS ANALYSIS WITH SCD DIMENSIONS ===")

    # Join claims with point-in-time eligibility (SCD2) and provider info (SCD1)
    claims_with_dimensions = spark.sql("""
        SELECT 
            c.claim_id,
            c.member_id,
            try_cast(c.billed_amount AS DOUBLE) as billed_amount,
            try_cast(c.paid_amount AS DOUBLE) as paid_amount,
            c.date_of_service,
            c.Type_of_Service,
            
            -- Eligibility status at date of service (SCD2)
            e.eligibility_status,
            e.eligibility_start_date,
            e.eligibility_end_date,
            
            -- Current provider info (SCD1)
            p.name as provider_name,
            p.specialty as provider_specialty,
            p.practice_location,
            p.is_pcp
            
        FROM default.claims c
        
        -- Join with eligibility (SCD2 - point in time)
        -- A NULL end date marks an open-ended period and must still match
        LEFT JOIN healthanalytics.eligibility e
            ON c.member_id = try_cast(e.member_id AS STRING)
            AND c.date_of_service >= e.eligibility_start_date
            AND (e.eligibility_end_date IS NULL
                 OR c.date_of_service <= e.eligibility_end_date)
        
        -- Join with current provider info (SCD1 - latest version)
        LEFT JOIN default.providers p
            ON c.ProviderID = p.provider_id
        
        WHERE c.date_of_service >= '2024-01-01'
    """)

    print("Claims with SCD dimension data:")
    display(claims_with_dimensions)

    # Analyze claims by eligibility status.
    # `count`, `sum` and `avg` here are the pyspark.sql.functions aggregates
    # imported at the top of this cell (that import shadows the builtin `sum`).
    eligibility_impact = claims_with_dimensions.groupBy("eligibility_status", "Type_of_Service") \
        .agg(
            count("*").alias("claim_count"),
            sum("billed_amount").alias("total_billed"),
            avg("billed_amount").alias("avg_billed")
        ).orderBy("eligibility_status", "Type_of_Service")

    print("Claims analysis by eligibility status:")
    display(eligibility_impact)

    # Analyze claims by provider location (after SCD1 updates)
    location_impact = claims_with_dimensions.groupBy("practice_location", "provider_specialty") \
        .agg(
            count("*").alias("claim_count"),
            sum("billed_amount").alias("total_billed"),
            countDistinct("member_id").alias("unique_patients")
        ).orderBy("total_billed", ascending=False)

    print("Claims analysis by provider location:")
    display(location_impact)

    return claims_with_dimensions

# Run integrated analysis
integrated_analysis = analyze_claims_with_scd_dimensions()