In [None]:
from pyspark.sql.functions import col, expr, explode, coalesce, array, current_timestamp, lit
from pyspark.sql.types import StructType, StructField, StringType

In [None]:
def dot_to_variant_path(dot_path):
    """Turn a dot-separated path into a colon-separated VARIANT path.

    Every "." in ``dot_path`` becomes ":"; all other characters are kept
    unchanged (e.g. "observation.id._root" -> "observation:id:_root").
    """
    segments = dot_path.split(".")
    return ":".join(segments)

In [None]:
def _explode_variant_sql(variant_expr, alias):
    """Build SQL that explodes a VARIANT holding either an array or a single value.

    The value is cast to ARRAY<VARIANT> when possible; otherwise it is wrapped
    in a one-element array so a lone object is still emitted as one row.
    Note that a missing (NULL) value is wrapped too, producing one row whose
    exploded value is NULL.
    """
    return (
        "explode(coalesce("
        f"try_cast({variant_expr} AS ARRAY<VARIANT>), "
        f"array({variant_expr})"
        f")) AS {alias}"
    )


def extract_section_entries(df, variant_col, section_loinc_code, sub_array_path=None):
    """Return one row per entry of the section whose code matches ``section_loinc_code``.

    Args:
        df: DataFrame with the parsed document stored in a VARIANT column.
        variant_col: Name of that VARIANT column. It is interpolated into the
            SQL text as-is, so it must be a trusted identifier.
        section_loinc_code: Value compared with each section's ``code._code``.
            It is bound as a literal (never spliced into SQL text), so quotes
            or other special characters cannot alter the query.
        sub_array_path: Optional dot-separated path inside each entry. When
            given, the value at that path is exploded in turn and replaces
            ``_entry``. Interpolated into the SQL text as-is (trusted input).

    Returns:
        DataFrame with columns ``_section_code``, ``_section_title`` and
        ``_entry`` (a VARIANT).
    """
    # str() mirrors the original string interpolation, so _section_code stays
    # a STRING column even when the caller passes a non-string code.
    code_literal = lit(str(section_loinc_code))

    # The body's component list gets the same array-or-single normalisation as
    # the entry lists below, so a body with a single section is handled too.
    sections_df = df.select(
        expr(
            _explode_variant_sql(
                f"{variant_col}:component:structuredBody:component", "_section"
            )
        )
    )

    matching_df = sections_df.filter(
        expr("_section:section:code:_code::string") == code_literal
    )

    entries_df = matching_df.select(
        code_literal.alias("_section_code"),
        expr("_section:section:title::string AS _section_title"),
        expr(_explode_variant_sql("_section:section:entry", "_entry")),
    )

    if sub_array_path:
        nested_path = dot_to_variant_path(sub_array_path)
        entries_df = entries_df.select(
            "_section_code",
            "_section_title",
            expr(_explode_variant_sql(f"_entry:{nested_path}", "_entry")),
        )

    return entries_df

In [None]:
def extract_fields(entries_df, field_mappings):
    """Project the mapped fields out of each ``_entry`` VARIANT.

    Args:
        entries_df: DataFrame exposing ``_section_code``, ``_section_title``
            and ``_entry`` columns.
        field_mappings: Iterable of 4-tuples
            ``(target_column, source_json_path, target_data_type, transformation_sql)``.
            ``source_json_path`` is dot-separated and resolved under ``_entry``.
            A truthy ``transformation_sql`` is used as the column expression
            after its ``{value}`` placeholder is replaced by the cast field
            expression; otherwise the cast field expression is used directly.

    Returns:
        DataFrame with ``_section_code``, ``_section_title`` and one column per
        mapping, in mapping order.
    """

    def _projection(column_name, variant_path, sql_type, transform):
        # Cast expression for the raw field, e.g. _entry:a:b::STRING
        typed_value = f"_entry:{variant_path}::{sql_type}"
        body = transform.replace("{value}", typed_value) if transform else typed_value
        return f"{body} AS {column_name}"

    mapped_columns = [
        _projection(
            target_column,
            dot_to_variant_path(source_json_path),
            target_data_type,
            transformation_sql,
        )
        for target_column, source_json_path, target_data_type, transformation_sql
        in field_mappings
    ]
    return entries_df.selectExpr("_section_code", "_section_title", *mapped_columns)

In [None]:
def parse_ccda(spark, json_string, section_loinc_code, field_mappings, sub_array_path=None):
    """Parse a C-CDA JSON document and extract the mapped fields of one section.

    The raw JSON text is parsed straight into a VARIANT with ``parse_json``.
    This avoids a round trip through ``spark.read.json`` schema inference and
    ``to_json`` (which can stringify fields whose shape varies between
    elements and drops nulls), does not register a temporary view, and does
    not need the RDD API (``sparkContext``).

    Args:
        spark: Active SparkSession.
        json_string: The whole document as a single JSON string (a JSON
            object at the top level is expected).
        section_loinc_code: Section code to select; see
            ``extract_section_entries``.
        field_mappings: List of
            ``(target_column, source_json_path, target_data_type, transformation_sql)``
            tuples; see ``extract_fields``.
        sub_array_path: Optional dot-separated path to a nested list inside
            each entry that should be exploded as well.

    Returns:
        DataFrame with ``_section_code``, ``_section_title`` and one column per
        field mapping.

    Raises:
        Malformed JSON is reported by ``parse_json`` when the result is
        evaluated, instead of silently producing an empty result.
    """
    # Explicit schema so a None document still yields a (NULL) STRING column
    # rather than failing type inference.
    variant_df = spark.createDataFrame(
        [(json_string,)], "_raw_json STRING"
    ).selectExpr("parse_json(_raw_json) AS doc")

    entries_df = extract_section_entries(variant_df, "doc", section_loinc_code, sub_array_path)
    return extract_fields(entries_df, field_mappings)

In [None]:
# --- Usage ---
# json_blob = open("/path/to/ccda.json").read()
#
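# field_mappings is a list of 4-tuples:
#   (target_column, source_json_path, target_data_type, transformation_sql_or_None)
# source_json_path is a dot-separated path relative to each exploded entry.
# transformation_sql, when given, is a SQL snippet in which the literal text "{value}"
# is replaced by the cast field expression; pass None to use the cast value as-is.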
#
# results_mappings = [
#     ("OBSERVATION_ID",    "observation.id._root",                       "STRING", None),
#     ("VALUE_NUMERIC",     "observation.value.translation._value",       "STRING", None),
#     ("VALUE_TEXT",         "observation.value._VALUE",                   "STRING", None),
#     ("UNIT_UCUM_CODE",    "observation.value.translation.originalText", "STRING", None),
#     ("EFFECTIVE_TMSTP",   "observation.effectiveTime.low._value",       "STRING", None),
# ]
#
# df = parse_ccda(spark, json_blob, "30954-2", results_mappings, sub_array_path="organizer.component")
# df.show(truncate=False)