In [0]:
from pyspark.sql.functions import col, expr, explode, coalesce, array, current_timestamp, lit
from pyspark.sql.types import StructType, StructField, StringType

In [0]:
def dot_to_variant_path(dot_path):
    """Rewrite a dot-separated path as a colon-separated path.

    Every "." in ``dot_path`` becomes ":", e.g. "observation.id._root"
    becomes "observation:id:_root". The result is what the other helpers in
    this notebook splice into their ``_entry:<path>`` SQL expressions.
    """
    segments = dot_path.split(".")
    return ":".join(segments)

In [0]:
def _explode_variant_sql(variant_path, alias):
    """Build a SQL ``explode(...) as <alias>`` expression for a one-or-many VARIANT path."""
    # try_cast yields NULL when the value is not an array; coalesce then falls
    # back to wrapping the single value in a one-element array so explode
    # still emits it.
    # NOTE(review): when the path is absent both branches see NULL, so this
    # presumably emits one row with a NULL value rather than no rows - confirm
    # whether that is the desired behavior.
    return f"""explode(
        coalesce(
            try_cast({variant_path} AS ARRAY<VARIANT>),
            array({variant_path})
        )
    ) as {alias}"""


def extract_section_entries(df, variant_col, section_title, section_loinc_code=None, sub_array_path=None):
    """Return one row per entry of the document section(s) titled ``section_title``.

    Args:
        df: DataFrame holding the parsed document in the column ``variant_col``.
        variant_col: Name of that column; it is spliced into the SQL path
            ``<variant_col>:component:structuredBody:component``.
        section_title: Title to match against ``_section:section:title``.
            It is bound as a literal value (not interpolated into SQL text),
            so quotes or backslashes in the title cannot break the query.
        section_loinc_code: Accepted for interface compatibility but currently
            not used; sections are selected by title only.
        sub_array_path: Optional dot-separated path inside each entry
            (e.g. "organizer.component"). When given, each entry is exploded
            one level further along that path.

    Returns:
        DataFrame with two columns: ``section_title`` and ``_entry``.
    """
    # 1. Explode the structured body into one row per section.
    sections_df = df.selectExpr(
        _explode_variant_sql(f"{variant_col}:component:structuredBody:component", "_section")
    )

    # str() mirrors the previous f-string interpolation for non-string titles.
    title_literal = lit(str(section_title))

    # 2. Keep only the section(s) whose title matches.
    filtered_df = sections_df.filter(
        expr("_section:section:title::string") == title_literal
    )

    # 3. Explode the entries inside the matching section(s).
    entries_df = filtered_df.select(
        title_literal.alias("section_title"),
        expr(_explode_variant_sql("_section:section:entry", "_entry")),
    )

    # 4. Optional drill-down (e.g. organizer -> component).
    if sub_array_path:
        sub_variant_path = dot_to_variant_path(sub_array_path)
        entries_df = entries_df.selectExpr(
            "section_title",
            _explode_variant_sql(f"_entry:{sub_variant_path}", "_entry"),
        )

    return entries_df



In [0]:
def extract_fields(entries_df, field_mappings):
    """Project each ``_entry`` row into flat, typed columns.

    Args:
        entries_df: DataFrame exposing ``section_title`` and ``_entry`` columns.
        field_mappings: Iterable of 4-tuples
            ``(target_column, source_json_path, target_data_type, transformation_sql)``.
            ``source_json_path`` is dot-separated and relative to ``_entry``.
            ``transformation_sql`` is either falsy or a SQL template in which
            every ``{value}`` is replaced by the extraction expression.

    Returns:
        The result of ``entries_df.selectExpr`` with ``section_title`` followed
        by one aliased expression per mapping.
    """

    def to_select_expr(mapping):
        column_name, json_path, sql_type, transform_template = mapping
        # e.g. _entry:observation:id:_root::STRING
        extraction = "_entry:{}::{}".format(dot_to_variant_path(json_path), sql_type)
        if transform_template:
            # Wrap the extraction in the caller-supplied SQL template.
            return transform_template.replace("{value}", extraction) + f" AS {column_name}"
        return f"{extraction} AS {column_name}"

    select_exprs = ["section_title", *map(to_select_expr, field_mappings)]
    return entries_df.selectExpr(*select_exprs)

In [0]:
def parse_ccda(spark, json_string, section_title, field_mappings, sub_array_path=None):
    """Parse a JSON document string and flatten one of its sections.

    Args:
        spark: Session object used to create the initial DataFrame.
        json_string: The whole document as a single JSON string.
        section_title: Title of the section to extract.
        field_mappings: Column mappings forwarded to ``extract_fields``.
        sub_array_path: Optional dot-separated path forwarded to
            ``extract_section_entries`` for an extra explode step.

    Returns:
        The DataFrame produced by ``extract_fields``.
    """
    # Load the text as a single string cell so no schema is inferred from it.
    single_row = [(json_string,)]
    blob_df = spark.createDataFrame(single_row, ["raw_json_blob"])

    # Convert the string into a VARIANT column named "doc".
    parsed_df = blob_df.selectExpr("parse_json(raw_json_blob) AS doc")

    # Locate the section's entries, then project them into flat columns.
    return extract_fields(
        extract_section_entries(parsed_df, "doc", section_title, sub_array_path=sub_array_path),
        field_mappings,
    )

In [0]:
# Read the raw document; the context manager closes the file handle once the
# content has been loaded.
with open("/Volumes/workspace/default/files/brnz_ccda_raw_varient.json") as ccda_file:
    json_blob = ccda_file.read()

# Mappings for the Observation elements. Each tuple is
# (target column, dot-separated path inside the entry, cast type, optional SQL transform).
results_mappings = [
  # --- Identifiers ---
  ("OBSERVATION_ID",    "observation.id._root",  "STRING", None),
  ("OBSERVATION_MOOD_CODE", "observation.moodCode", "STRING", None),
  ("OBSERVATION_CLASS_CODE", "observation.classCode", "STRING", None),

  # --- Test Details ---
  ("OBSERVATION_CODE",              "observation.code._code",             "STRING", None),
  ("OBSERVATION_CODE_SYSTEM",       "observation.code._codeSystem",        "STRING", None),
  ("OBSERVATION_DISPLAY_NAME",      "observation.code._displayName",       "STRING", None),
  ("OBSERVATION_TEST_ORIG_TEXT",    "observation.code.originalText",        "STRING", None),

  # --- Results (Numeric vs Text) ---
  ("OBSERVATION_VAL_NUMERIC",       "observation.value.translation._value",       "STRING", None),
  ("OBSERVATION_VAL_TEXT",          "observation.value._VALUE",                   "STRING", None),
  ("OBSERVATION_UNIT",              "observation.value.translation.originalText", "STRING", None),

  # --- Context ---
  ("OBSERVATION_INTERPRETATION",    "observation.interpretationCode._displayName","STRING", None),
  ("OBSERVATION_REF_RANGE",         "observation.referenceRange.observationRange.text", "STRING", None),
  ("OBSERVATION_STATUS",            "observation.statusCode._code",               "STRING", None),
  ("OBSERVATION_EFFECTIVE_TIME",    "observation.effectiveTime._value",           "STRING", None),

  # --- Template ID ---
  ("OBSERVATION_TEMPLATE_ID",       "observation.templateId._root",             "STRING", None),
  ("OBSERVATION_TEMPLATE_ID_EXT",   "observation.templateId._extension",        "STRING", None)

 ]

# Run the parser.
# Note: the section is selected by its title ("Results"); no LOINC code is passed.
df = parse_ccda(
    spark,
    json_blob,
    section_title="Results",
    field_mappings=results_mappings,
    sub_array_path="organizer.component"
)

display(df)