In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window


In [0]:
dbutils.widgets.text("company_id", "")
dbutils.widgets.text("snapshot_at", "") 

company_id  = dbutils.widgets.get("company_id").strip()
snapshot_at = dbutils.widgets.get("snapshot_at").strip()
 
if not company_id:
    raise ValueError("Missing required param: company_id")
if not snapshot_at:
    raise ValueError("Missing required param: snapshot_at (UTC boundary like 2025-12-17T00:00:00Z)")

spark.conf.set("spark.sql.session.timeZone", "UTC")

asff_tbl   = f"cloudfastener.{company_id}.aws_securityhub_findings_1_0"
ocsf_tbl   = f"cloudfastener.{company_id}.aws_securitylake_sh_findings_2_0"
silver_tbl = f"cloudfastener.{company_id}.silver_aws_compliance_findings"

snapshot_at = F.date_trunc(
    "DAY",
    F.to_timestamp(F.lit(snapshot_at)))

window_end_ts   = F.to_timestamp(F.lit(snapshot_at))
window_start_ts = snapshot_at - F.expr("INTERVAL 1 DAY")

print("============================================================")
print("Bronze → Silver job starting")
print(f"company_id   = {company_id}")
print(f"snapshot_at  = {snapshot_at} (UTC logical boundary)")
print(f"window_start = snapshot_at - 1 day")
print(f"ASFF bronze  = {asff_tbl}")
print(f"OCSF bronze  = {ocsf_tbl}")
print(f"Silver       = {silver_tbl}")
print("============================================================")


Bronze → Silver job starting
company_id   = xs22xw4aw73q
snapshot_at  = Column<'date_trunc(DAY, to_timestamp(2025-12-17T00:00:00Z))'> (UTC logical boundary)
window_start = snapshot_at - 1 day
ASFF bronze  = cloudfastener.xs22xw4aw73q.aws_securityhub_findings_1_0
OCSF bronze  = cloudfastener.xs22xw4aw73q.aws_securitylake_sh_findings_2_0
Silver       = cloudfastener.xs22xw4aw73q.silver_aws_compliance_findings


In [0]:
def table_exists(full_name: str) -> bool:
    try:
        return spark.catalog.tableExists(full_name)
    except Exception:
        return False

def normalize_finding_id(col):
    # trim, convert empty to NULL
    return F.when(F.length(F.trim(col)) == 0, F.lit(None)).otherwise(F.trim(col))

def parse_iso8601_to_ts(col):
    # Works for ISO8601 strings like "...Z"
    return F.to_timestamp(col)


In [0]:
asff_exists = table_exists(asff_tbl)
ocsf_exists = table_exists(ocsf_tbl)

print(f"ASFF exists? {asff_exists}")
print(f"OCSF exists? {ocsf_exists}")

if not asff_exists and not ocsf_exists:
    raise RuntimeError("Neither bronze table exists; cannot build silver for this company.")

sources = []

if asff_exists:
    df_asff_raw = (
        spark.table(asff_tbl)
        .where(
            (F.col("product_name") == "Security Hub") &(F.col("cf_processed_time") >= window_start_ts) &
            (F.col("cf_processed_time") <  window_end_ts))
    )
    asff_count = df_asff_raw.count()
    print(f"ASFF rows in window: {asff_count}")
    if asff_count > 0:
        sources.append(("ASFF", df_asff_raw))

if ocsf_exists:
    df_ocsf_raw = (
        spark.table(ocsf_tbl)
        .where(
            (F.col("metadata.product.name") == "Security Hub") &
            (F.col("cf_processed_time") >= window_start_ts) 
            &
               (F.col("cf_processed_time") <  window_end_ts))
    )
    ocsf_count = df_ocsf_raw.count()
    print(f"OCSF rows in window: {ocsf_count}")
    if ocsf_count > 0:
        sources.append(("OCSF", df_ocsf_raw))

if len(sources) == 0:
    print("No rows found in the window for existing sources. Nothing to merge.")
    dbutils.notebook.exit("EMPTY_WINDOW")


ASFF exists? True
OCSF exists? True
ASFF rows in window: 0
OCSF rows in window: 65316


In [0]:
from pyspark.sql import functions as F

def transform_asff(df):
    return (
        df.select(
            # identity
            F.lit(company_id).alias("company_id"),
            F.lit("ASFF").alias("finding_source"),

            # key
            normalize_finding_id(F.col("finding_id")).alias("finding_id"),

            # times (ASFF columns are STRING in your schema dump)
            parse_iso8601_to_ts(F.col("created_at")).alias("finding_created_time"),
            parse_iso8601_to_ts(F.col("updated_at")).alias("finding_modified_time"),

            # status (normalize ASFF workflow statuses into the OCSF-style set)
            F.when(F.upper(F.col("workflow.Status")) == "NEW", "New")
             .when(F.upper(F.col("workflow.Status")) == "NOTIFIED", "In Progress")
             .when(F.upper(F.col("workflow.Status")) == "SUPPRESSED", "Suppressed")
             .when(F.upper(F.col("workflow.Status")) == "RESOLVED", "Resolved")
             .otherwise(F.col("workflow.Status"))
             .alias("finding_status"),

            # account/region
            F.col("aws_account_id").cast("string").alias("account_id"),
            F.col("finding_region").cast("string").alias("region_id"),

            # compliance / control mapping (correct for your ASFF schema)
            F.expr("compliance.AssociatedStandards[0].StandardsId").cast("string").alias("standard_id"),
            F.col("compliance.SecurityControlId").cast("string").alias("control_id"),
            F.col("compliance.Status").cast("string").alias("compliance_status"),

            # control title
            F.col("title").cast("string").alias("control_title"),

            # severity (struct<Label,Normalized,Original>)
            F.col("severity.Label").cast("string").alias("severity"),
            F.lit(None).cast("string").alias("rule_severity"),

            # resource (array<struct<Type,Id,...>>; note the capitalized keys)
            F.expr("resources[0].Id").cast("string").alias("resource_id"),
            F.expr("resources[0].Type").cast("string").alias("resource_type"),

            # snapshot for this run
            F.to_timestamp(F.lit(snapshot_at)).alias("snapshot_at"),

            # tie-break helpers for dedupe
            F.col("cf_processed_time").alias("_cf_processed_time"),
            F.lit(0).alias("_source_preference")
        )
    )


In [0]:
def transform_ocsf(df):
    return (
        df.select(
            F.lit(company_id).alias("company_id"),
            F.lit("OCSF").alias("finding_source"),

            normalize_finding_id(F.col("finding_info.uid")).alias("finding_id"),

            parse_iso8601_to_ts(F.col("finding_info.created_time_dt")).alias("finding_created_time"),
            parse_iso8601_to_ts(F.col("finding_info.modified_time_dt")).alias("finding_modified_time"),

            F.col("status").cast("string").alias("finding_status"),

            F.col("cloud.account.uid").cast("string").alias("account_id"),
            F.col("cloud.region").cast("string").alias("region_id"),

            F.expr("compliance.standards[0]").cast("string").alias("standard_id"),
            F.col("compliance.control").cast("string").alias("control_id"),
            F.col("compliance.status").cast("string").alias("compliance_status"),

            F.col("finding_info.title").cast("string").alias("control_title"),

            F.col("severity").cast("string").alias("severity"),
            F.lit(None).cast("string").alias("rule_severity"),

            F.coalesce(
                F.expr("resources[0].uid").cast("string"),
                F.col("resource.uid").cast("string"),
                F.col("unmapped").getItem("ProductFields.Resources:0/Id").cast("string")
            ).alias("resource_id"),

            F.coalesce(
                F.expr("resources[0].type").cast("string"),
                F.col("resource.type").cast("string"),
                F.col("unmapped").getItem("ProductFields.Resources:0/Type").cast("string")
            ).alias("resource_type"),

            F.to_timestamp(F.lit(snapshot_at)).alias("snapshot_at"),

            F.col("cf_processed_time").alias("_cf_processed_time"),
            F.lit(1).alias("_source_preference")
        )
    )


In [0]:
canonical_dfs = []

for src, dfraw in sources:
    if src == "ASFF":
        out = transform_asff(dfraw)
    elif src == "OCSF":
        out = transform_ocsf(dfraw)
    else:
        continue

    if out is None:
        raise RuntimeError(f"Transform returned None for source={src}. Check function definitions.")

    # Keep only valid keys early; it reduces later work
    out = out.withColumn("finding_id", normalize_finding_id(F.col("finding_id"))) \
             .where(F.col("finding_id").isNotNull())

    canonical_dfs.append(out)

if not canonical_dfs:
    print("No canonical rows after filtering (empty window or null keys). Nothing to merge.")
    dbutils.notebook.exit("EMPTY_CANONICAL")

df_union = canonical_dfs[0]
for d in canonical_dfs[1:]:
    df_union = df_union.unionByName(d, allowMissingColumns=True)

print(f"Union rows: {df_union.count()}")


Union rows: 65316


In [0]:
w = Window.partitionBy("company_id", "finding_id").orderBy(
    F.col("finding_modified_time").desc_nulls_last(),
    F.col("_source_preference").desc(),
    F.col("_cf_processed_time").desc_nulls_last()
)

df_winners = (
    df_union
    .withColumn("_rn", F.row_number().over(w))
    .where(F.col("_rn") == 1)
)

winner_count = df_winners.count()
distinct_ids = df_winners.select("finding_id").distinct().count()


Winners rows: 4083
Distinct finding_id in winners: 4083


In [0]:
df_stage = (
    df_winners
    .withColumn("source_preference", F.col("_source_preference"))
    .withColumn("cf_processed_time", F.col("_cf_processed_time"))
    .drop("_rn", "_source_preference", "_cf_processed_time")
)

df_stage.createOrReplaceTempView("stg_winners")

# source_preference: higher wins
# ASFF = 0 (lower priority)
# OCSF = 1 (higher priority)
spark.sql(f"""
    MERGE INTO {silver_tbl} t
    USING stg_winners s
       ON t.company_id = s.company_id 
      AND t.finding_id = s.finding_id

    WHEN MATCHED AND (
           t.finding_modified_time IS NULL
        OR s.finding_modified_time > t.finding_modified_time
        OR (
             s.finding_modified_time = t.finding_modified_time
             AND (CASE WHEN t.finding_source = 'OCSF' THEN 1 ELSE 0 END) < s.source_preference
           )
    ) THEN UPDATE SET
        t.finding_source        = s.finding_source,
        t.finding_created_time  = s.finding_created_time,
        t.finding_modified_time = s.finding_modified_time,
        t.snapshot_at           = s.snapshot_at,
        t.finding_status        = s.finding_status,
        t.account_id            = s.account_id,
        t.region_id             = s.region_id,
        t.standard_id           = s.standard_id,
        t.control_id            = s.control_id,
        t.control_title         = s.control_title,
        t.severity              = s.severity,
        t.rule_severity         = s.rule_severity,
        t.compliance_status     = s.compliance_status,
        t.resource_id           = s.resource_id,
        t.resource_type         = s.resource_type,
        t.updated_at            = current_timestamp()

    WHEN NOT MATCHED THEN INSERT (
        company_id, 
        finding_source, 
        created_at, 
        updated_at, 
        finding_id,
        finding_created_time, 
        finding_modified_time, 
        snapshot_at, 
        finding_status,
        account_id, 
        region_id, 
        standard_id, 
        control_id, 
        control_title, 
        severity,
        rule_severity, 
        compliance_status, 
        resource_id, 
        resource_type
    ) VALUES (
        s.company_id, 
        s.finding_source, 
        current_timestamp(), 
        current_timestamp(), 
        s.finding_id,
        s.finding_created_time, 
        s.finding_modified_time, 
        s.snapshot_at, 
        s.finding_status,
        s.account_id, 
        s.region_id, 
        s.standard_id, 
        s.control_id, 
        s.control_title, 
        s.severity,
        s.rule_severity, 
        s.compliance_status, 
        s.resource_id, 
        s.resource_type
    )
""")

print("MERGE completed.")
print("Job done.")

MERGE completed.
Job done.
