In [None]:
# Import required libraries for config table setup
from pyspark.sql import SparkSession
from datetime import datetime

In [None]:
# Document type registry.
# One row per supported JSON document format. identifier_path and
# identifier_value hold the rule that is checked against an incoming JSON
# blob to decide which parser applies to it.
document_type_ddl = """
CREATE TABLE IF NOT EXISTS clinical_document_type (
    document_type_code STRING NOT NULL,
    document_type_name STRING,
    identifier_path STRING,
    identifier_value STRING,
    is_active BOOLEAN,
    date_created TIMESTAMP
) USING DELTA
"""
# IF NOT EXISTS keeps this cell safe to re-run against an existing table.
spark.sql(document_type_ddl)
print("Table clinical_document_type created or already exists.")

In [None]:
# Field mapping table.
# A row ties one source JSON path to one target column of a silver table.
#
# path_context says what source_json_path is relative to:
#   'root'          -> the document root
#   'section_entry' -> an exploded section entry
# section_loinc_code selects the CCDA section the value is extracted from.
# entry_sub_array_path, when populated, names a nested array inside the entry
#   that has to be exploded before fields are extracted (Results and Vital
#   Signs entries carry organizer.component[] arrays, for example). In that
#   case source_json_path is relative to the sub-array item, not to the entry.
field_mapping_ddl = """
CREATE TABLE IF NOT EXISTS clinical_field_mapping (
    mapping_id INT,
    document_type_code STRING NOT NULL,
    target_table STRING NOT NULL,
    target_column STRING NOT NULL,
    column_ordinal INT,
    source_json_path STRING,
    path_context STRING,
    section_loinc_code STRING,
    entry_sub_array_path STRING,
    target_data_type STRING,
    transformation_sql STRING,
    is_array_field BOOLEAN,
    is_active BOOLEAN,
    date_created TIMESTAMP
) USING DELTA
"""
# IF NOT EXISTS keeps this cell safe to re-run against an existing table.
spark.sql(field_mapping_ddl)
print("Table clinical_field_mapping created or already exists.")

In [None]:
# Processing log / watermark tracking table.
# A completed run stores its high_watermark (the max insert_timestamp read
# from bronze); later runs only pick up rows beyond that watermark so nothing
# is processed twice.
processing_log_ddl = """
CREATE TABLE IF NOT EXISTS clinical_processing_log (
    log_id INT,
    document_type_code STRING,
    target_table STRING,
    bronze_table_name STRING,
    run_start_timestamp TIMESTAMP,
    run_end_timestamp TIMESTAMP,
    records_read INT,
    records_written INT,
    records_failed INT,
    high_watermark TIMESTAMP,
    run_status STRING
) USING DELTA
"""
# IF NOT EXISTS keeps this cell safe to re-run against an existing table.
spark.sql(processing_log_ddl)
print("Table clinical_processing_log created or already exists.")

In [None]:
# Register the CCDA document type.
# A CCDA document is recognised by a templateId whose _root value is
# '2.16.840.1.113883.10.20.22.1.1' (the C-CDA R2.1 header template); this
# identifier check is run against each incoming JSON blob.
# The NOT EXISTS guard makes the insert a no-op when a 'CCDA' row is already
# present, so the cell can be re-run without creating duplicates.
ccda_seed_sql = """
INSERT INTO clinical_document_type
SELECT * FROM (
    SELECT
        'CCDA' AS document_type_code,
        'Consolidated Clinical Document Architecture' AS document_type_name,
        'templateId' AS identifier_path,
        '2.16.840.1.113883.10.20.22.1.1' AS identifier_value,
        true AS is_active,
        current_timestamp() AS date_created
) src
WHERE NOT EXISTS (
    SELECT 1 FROM clinical_document_type
    WHERE document_type_code = 'CCDA'
)
"""
spark.sql(ccda_seed_sql)
print("CCDA document type seeded.")

In [None]:
# Seed field mappings for OBSERVATION table - Results section (LOINC 30954-2)
# The Results section uses organizer > component[] > observation structure.
# entry_sub_array_path = 'organizer.component' tells the extractor to explode
# the component array, so paths below are relative to each component item.
# Numeric values in this section use translation._value; text values use _VALUE.
#
# Idempotency: rows are inserted only when their mapping_id is not already in
# clinical_field_mapping, so re-running this cell does not duplicate mappings
# (same guard style as the CCDA document type seed).
#
# Layout: values shared by every row of this section (document type, target
# table, path context, LOINC code, sub-array path, flags, timestamp) are
# projected once in the SELECT list; the inline VALUES table carries only the
# per-field columns. transformation_sql is cast explicitly so NULL entries are
# typed as STRING.
#
# Quoting: the ABNORMAL_IND transformation is written as a double-quoted SQL
# string literal so the single quotes around H/L/HH/LL/A are stored verbatim.
# The doubled-single-quote form ('' inside '...') is parsed by some Spark
# versions as adjacent literals that get concatenated, which drops the quotes.
# NOTE(review): double-quoted string literals rely on the default setting
# spark.sql.ansi.doubleQuotedIdentifiers=false - confirm for this workspace.
spark.sql("""
INSERT INTO clinical_field_mapping
SELECT
    src.mapping_id,
    'CCDA' AS document_type_code,
    'OBSERVATION' AS target_table,
    src.target_column,
    src.column_ordinal,
    src.source_json_path,
    'section_entry' AS path_context,
    '30954-2' AS section_loinc_code,
    'organizer.component' AS entry_sub_array_path,
    src.target_data_type,
    CAST(src.transformation_sql AS STRING) AS transformation_sql,
    false AS is_array_field,
    true AS is_active,
    current_timestamp() AS date_created
FROM VALUES
    (1, 'OBSERVATION_ID', 1,
     'observation.id._root', 'STRING', NULL),
    (2, 'OBSERVATION_STATUS_CODE', 2,
     'observation.statusCode._code', 'STRING', NULL),
    (3, 'OBSERVATION_TYPE_SYSTEM_NAME', 3,
     'observation.code._codeSystemName', 'STRING', NULL),
    (4, 'OBSERVATION_TYPE_SYSTEM_CODE', 4,
     'observation.code._code', 'STRING', NULL),
    (5, 'OBSERVATION_TYPE_DISPLAY_NAME', 5,
     'observation.code._displayName', 'STRING', NULL),
    (6, 'RESULT_VALUE_TYPE', 6,
     'observation.value._xsi:type', 'STRING', NULL),
    (7, 'VALUE_NUMERIC', 7,
     'observation.value.translation._value', 'STRING', NULL),
    (8, 'VALUE_TEXT', 8,
     'observation.value._VALUE', 'STRING', NULL),
    (9, 'UNIT_UCUM_CODE', 9,
     'observation.value.translation.originalText', 'STRING', NULL),
    (10, 'ABNORMAL_IND', 10,
     'observation.interpretationCode._code', 'STRING',
     "CASE WHEN {value} IN ('H','L','HH','LL','A') THEN true ELSE false END"),
    (11, 'EFFECTIVE_TMSTP', 11,
     'observation.effectiveTime.low._value', 'STRING', NULL)
AS src(mapping_id, target_column, column_ordinal,
       source_json_path, target_data_type, transformation_sql)
WHERE NOT EXISTS (
    SELECT 1 FROM clinical_field_mapping tgt
    WHERE tgt.mapping_id = src.mapping_id
)
""")
print("OBSERVATION mappings seeded for Results section (30954-2).")

In [None]:
# Seed field mappings for OBSERVATION table - Vital Signs section (LOINC 8716-3)
# Vital signs use the same organizer > component[] > observation structure as Results.
# entry_sub_array_path = 'organizer.component' triggers sub-item explosion.
# Vital signs values use _value and _unit directly on observation.value.
#
# Idempotency: rows are inserted only when their mapping_id is not already in
# clinical_field_mapping, so re-running this cell does not duplicate mappings
# (same guard style as the CCDA document type seed).
#
# Layout: values shared by every row of this section are projected once in the
# SELECT list; the inline VALUES table carries only the per-field columns.
# transformation_sql is cast explicitly because every entry here is NULL and
# would otherwise be untyped.
#
# NOTE(review): EFFECTIVE_TMSTP has column_ordinal 10 here but 11 in the
# Results section seed (which also maps ABNORMAL_IND at 10). Confirm whether
# the extractor expects ordinals for the same OBSERVATION column to agree
# across sections.
spark.sql("""
INSERT INTO clinical_field_mapping
SELECT
    src.mapping_id,
    'CCDA' AS document_type_code,
    'OBSERVATION' AS target_table,
    src.target_column,
    src.column_ordinal,
    src.source_json_path,
    'section_entry' AS path_context,
    '8716-3' AS section_loinc_code,
    'organizer.component' AS entry_sub_array_path,
    src.target_data_type,
    CAST(src.transformation_sql AS STRING) AS transformation_sql,
    false AS is_array_field,
    true AS is_active,
    current_timestamp() AS date_created
FROM VALUES
    (20, 'OBSERVATION_ID', 1,
     'observation.id._root', 'STRING', NULL),
    (21, 'OBSERVATION_STATUS_CODE', 2,
     'observation.statusCode._code', 'STRING', NULL),
    (22, 'OBSERVATION_TYPE_SYSTEM_NAME', 3,
     'observation.code._codeSystemName', 'STRING', NULL),
    (23, 'OBSERVATION_TYPE_SYSTEM_CODE', 4,
     'observation.code._code', 'STRING', NULL),
    (24, 'OBSERVATION_TYPE_DISPLAY_NAME', 5,
     'observation.code._displayName', 'STRING', NULL),
    (25, 'RESULT_VALUE_TYPE', 6,
     'observation.value._xsi:type', 'STRING', NULL),
    (26, 'VALUE_NUMERIC', 7,
     'observation.value._value', 'STRING', NULL),
    (27, 'VALUE_TEXT', 8,
     'observation.value._VALUE', 'STRING', NULL),
    (28, 'UNIT_UCUM_CODE', 9,
     'observation.value._unit', 'STRING', NULL),
    (29, 'EFFECTIVE_TMSTP', 10,
     'observation.effectiveTime._value', 'STRING', NULL)
AS src(mapping_id, target_column, column_ordinal,
       source_json_path, target_data_type, transformation_sql)
WHERE NOT EXISTS (
    SELECT 1 FROM clinical_field_mapping tgt
    WHERE tgt.mapping_id = src.mapping_id
)
""")
print("OBSERVATION mappings seeded for Vital Signs section (8716-3).")

In [None]:
# Seed root-level mappings for OBSERVATION table
# These fields come from the document root, not from section entries.
# PATIENT_ID, ORGANIZATION_ID, and PRACTITIONER_ID are extracted from
# top-level document elements and joined to section-level data.
# Root mappings have entry_sub_array_path = NULL since they don't use entries.
#
# Idempotency: rows are inserted only when their mapping_id is not already in
# clinical_field_mapping, so re-running this cell does not duplicate mappings
# (same guard style as the CCDA document type seed).
#
# Layout: values shared by every row are projected once in the SELECT list;
# the inline VALUES table carries only the per-field columns. The NULL columns
# are cast explicitly to STRING so they are typed.
spark.sql("""
INSERT INTO clinical_field_mapping
SELECT
    src.mapping_id,
    'CCDA' AS document_type_code,
    'OBSERVATION' AS target_table,
    src.target_column,
    src.column_ordinal,
    src.source_json_path,
    'root' AS path_context,
    CAST(NULL AS STRING) AS section_loinc_code,
    CAST(NULL AS STRING) AS entry_sub_array_path,
    src.target_data_type,
    CAST(src.transformation_sql AS STRING) AS transformation_sql,
    false AS is_array_field,
    true AS is_active,
    current_timestamp() AS date_created
FROM VALUES
    (30, 'PATIENT_ID', 30,
     'recordTarget.patientRole.id._extension', 'STRING', NULL),
    (31, 'ORGANIZATION_ID', 31,
     'documentationOf.serviceEvent.performer.assignedEntity.representedOrganization.id._root',
     'STRING', NULL),
    (32, 'PRACTITIONER_ID', 32,
     'documentationOf.serviceEvent.performer.assignedEntity.id._extension',
     'STRING', NULL)
AS src(mapping_id, target_column, column_ordinal,
       source_json_path, target_data_type, transformation_sql)
WHERE NOT EXISTS (
    SELECT 1 FROM clinical_field_mapping tgt
    WHERE tgt.mapping_id = src.mapping_id
)
""")
print("OBSERVATION root-level mappings seeded.")

In [None]:
# Seed field mappings for ENCOUNTER table - Encounters section (LOINC 46240-8)
# Encounters use a flat entry > encounter structure (no nested array to explode),
# so entry_sub_array_path is NULL. Paths are relative to the entry element.
# Note: effectiveTime in encounters uses _value directly (not low/high split).
# NOTE(review): despite the note above, ENCOUNTER_END_TMSTP (mapping 44) reads
# encounter.effectiveTime.high._value - confirm which shape the source
# documents actually use.
#
# Idempotency: rows are inserted only when their mapping_id is not already in
# clinical_field_mapping, so re-running this cell does not duplicate mappings
# (same guard style as the CCDA document type seed).
#
# Layout: values shared by every row of this section are projected once in the
# SELECT list; the inline VALUES table carries only the per-field columns. The
# NULL columns are cast explicitly to STRING so they are typed.
spark.sql("""
INSERT INTO clinical_field_mapping
SELECT
    src.mapping_id,
    'CCDA' AS document_type_code,
    'ENCOUNTER' AS target_table,
    src.target_column,
    src.column_ordinal,
    src.source_json_path,
    'section_entry' AS path_context,
    '46240-8' AS section_loinc_code,
    CAST(NULL AS STRING) AS entry_sub_array_path,
    src.target_data_type,
    CAST(src.transformation_sql AS STRING) AS transformation_sql,
    false AS is_array_field,
    true AS is_active,
    current_timestamp() AS date_created
FROM VALUES
    (40, 'ENCOUNTER_ID', 1,
     'encounter.id._root', 'STRING', NULL),
    (41, 'ENCOUNTER_STATUS_CODE', 2,
     'encounter.statusCode._code', 'STRING', NULL),
    (42, 'ENCOUNTER_SERVICE_TYPE_TEXT', 3,
     'encounter.code.originalText.reference._value', 'STRING', NULL),
    (43, 'ENCOUNTER_START_TMSTP', 4,
     'encounter.effectiveTime._value', 'STRING', NULL),
    (44, 'ENCOUNTER_END_TMSTP', 5,
     'encounter.effectiveTime.high._value', 'STRING', NULL),
    (45, 'ENCOUNTER_PERFORMER_NAME', 6,
     'encounter.performer.assignedEntity.assignedPerson.name.family', 'STRING', NULL),
    (46, 'ENCOUNTER_PERFORMER_PHONE', 7,
     'encounter.performer.assignedEntity.telecom._value', 'STRING', NULL),
    (47, 'ENCOUNTER_LOCATION_CITY', 8,
     'encounter.performer.assignedEntity.addr.city', 'STRING', NULL),
    (48, 'ENCOUNTER_LOCATION_STATE', 9,
     'encounter.performer.assignedEntity.addr.state', 'STRING', NULL)
AS src(mapping_id, target_column, column_ordinal,
       source_json_path, target_data_type, transformation_sql)
WHERE NOT EXISTS (
    SELECT 1 FROM clinical_field_mapping tgt
    WHERE tgt.mapping_id = src.mapping_id
)
""")
print("ENCOUNTER section mappings seeded for Encounters section (46240-8).")

In [None]:
# Seed root-level mappings for ENCOUNTER table
# Patient demographics come from the document root, not from section entries.
#
# Idempotency: rows are inserted only when their mapping_id is not already in
# clinical_field_mapping, so re-running this cell does not duplicate mappings
# (same guard style as the CCDA document type seed).
#
# Layout: values shared by every row are projected once in the SELECT list;
# the inline VALUES table carries only the per-field columns. The NULL columns
# are cast explicitly to STRING so they are typed.
spark.sql("""
INSERT INTO clinical_field_mapping
SELECT
    src.mapping_id,
    'CCDA' AS document_type_code,
    'ENCOUNTER' AS target_table,
    src.target_column,
    src.column_ordinal,
    src.source_json_path,
    'root' AS path_context,
    CAST(NULL AS STRING) AS section_loinc_code,
    CAST(NULL AS STRING) AS entry_sub_array_path,
    src.target_data_type,
    CAST(src.transformation_sql AS STRING) AS transformation_sql,
    false AS is_array_field,
    true AS is_active,
    current_timestamp() AS date_created
FROM VALUES
    (50, 'PATIENT_ID', 20,
     'recordTarget.patientRole.id._extension', 'STRING', NULL),
    (51, 'PATIENT_NAME_FAMILY', 21,
     'recordTarget.patientRole.patient.name.family', 'STRING', NULL),
    (52, 'PATIENT_NAME_GIVEN', 22,
     'recordTarget.patientRole.patient.name.given', 'STRING', NULL)
AS src(mapping_id, target_column, column_ordinal,
       source_json_path, target_data_type, transformation_sql)
WHERE NOT EXISTS (
    SELECT 1 FROM clinical_field_mapping tgt
    WHERE tgt.mapping_id = src.mapping_id
)
""")
print("ENCOUNTER root-level mappings seeded.")

In [None]:
# Display the seeded configuration so it can be checked by eye.
# Each query is built first and shown second; output is never truncated.

# 1) All registered document types.
print("--- Document Types ---")
document_types_df = spark.sql("SELECT * FROM clinical_document_type")
document_types_df.show(truncate=False)

# 2) Active mapping counts per document type / target table / section / context.
print("--- Field Mappings Summary (by target table, section, and context) ---")
mapping_summary_df = spark.sql("""
    SELECT document_type_code, target_table, section_loinc_code,
           path_context, count(*) as field_count
    FROM clinical_field_mapping
    WHERE is_active = true
    GROUP BY document_type_code, target_table, section_loinc_code, path_context
    ORDER BY target_table, section_loinc_code
""")
mapping_summary_df.show(truncate=False)

# 3) Every active mapping, up to 50 rows.
print("--- Field Mapping Detail ---")
mapping_detail_df = spark.sql("""
    SELECT mapping_id, target_table, target_column,
           source_json_path, path_context, section_loinc_code
    FROM clinical_field_mapping
    WHERE is_active = true
    ORDER BY target_table, section_loinc_code, column_ordinal
""")
mapping_detail_df.show(50, truncate=False)

# 4) Processing log contents.
print("--- Processing Log (should be empty on first run) ---")
processing_log_df = spark.sql("SELECT * FROM clinical_processing_log")
processing_log_df.show(truncate=False)