# Security Hub Standards ETL Pipeline

AWS Security Hub compliance data processing pipeline.

## Configuration & Setup

Load parameters, configure Spark, and define the processing time window.

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import traceback

# ------------------------------------------------------------
# Job parameters
# ------------------------------------------------------------
# CATALOG_NAME is required. COMPANY_INDEX_ID is optional: blank or "ALL"
# (case-insensitive) switches the job to auto-discovery mode.
dbutils.widgets.text("CATALOG_NAME", "")
dbutils.widgets.text("COMPANY_INDEX_ID", "")

catalog_name = dbutils.widgets.get("CATALOG_NAME").strip()
company_index_id_param = dbutils.widgets.get("COMPANY_INDEX_ID").strip()

if not catalog_name:
    raise ValueError("Missing required param: CATALOG_NAME")

# Set timezone (must happen before the job date is resolved below)
spark.conf.set("spark.sql.session.timeZone", "UTC")

# Calculate job date and processing window
# 48-hour window ensures we capture all findings (Security Hub checks every 18 hours)
#
# The job date is resolved ONCE on the driver and then embedded as a literal.
# A lazy `date_trunc(current_timestamp())` expression would be re-evaluated by
# every query, so a run that crosses midnight UTC would process later
# companies with a different window and stamp them with a different
# cf_processed_time than earlier ones.
job_date_str = spark.sql(
    "SELECT date_format(date_trunc('DAY', current_timestamp()), 'yyyy-MM-dd HH:mm:ss') AS job_date"
).first()["job_date"]

# Still Column expressions, so downstream filters and F.lit(...) usage are unchanged.
job_date = F.to_timestamp(F.lit(job_date_str), "yyyy-MM-dd HH:mm:ss")
window_end_ts = job_date
window_start_ts = window_end_ts - F.expr("INTERVAL 48 HOURS")
cf_processed_time = job_date

# Determine processing mode
is_all_companies = not company_index_id_param or company_index_id_param.upper() == "ALL"

print("\n" + "="*80)
print("SECURITY HUB STANDARDS ETL PIPELINE")
print("="*80)
print(f"Catalog:            {catalog_name}")
print(f"Company Mode:       {'Auto-Discovery (ALL)' if is_all_companies else company_index_id_param}")
# Print the resolved value; formatting the Column itself would only show its repr.
print(f"Job Date:           {job_date_str} (UTC)")
print(f"Time Window:        48 hours (Security Hub check cycle: 18 hours)")
print(f"Retention Strategy: 1-day (TRUNCATE + Append)")
print("="*80 + "\n")

## Utility Functions

Helper functions for company ID validation, table existence checks, and company discovery.

In [0]:
def is_valid_company_id(schema_name: str) -> bool:
    """Return True if *schema_name* looks like a company ID.

    A company ID is exactly 12 ASCII characters, all alphanumeric, with no
    uppercase letters. The name is later interpolated into SQL identifiers,
    so non-ASCII letters and digits (which ``str.isalnum`` alone would
    accept) are rejected explicitly.

    Note: ``str.islower`` requires at least one cased character, so an
    all-digit name is not considered a company ID.
    """
    return (
        len(schema_name) == 12
        and schema_name.isascii()
        and schema_name.isalnum()
        and schema_name.islower()
    )

def table_exists(full_name: str) -> bool:
    """Return True if ``spark.catalog.tableExists`` reports the table as present.

    Any exception raised by the lookup is treated as "table not available"
    and yields False, but the error is logged so that a failed lookup can be
    told apart from a table that is simply missing.
    """
    try:
        return spark.catalog.tableExists(full_name)
    except Exception as exc:
        print(f"[WARN] Could not check existence of table {full_name}: {exc}")
        return False

In [0]:
def discover_companies(catalog: str) -> list:
    """Return the sorted names of all schemas in *catalog* that look like company IDs.

    Switches the session to *catalog*, lists its schemas and keeps those
    accepted by ``is_valid_company_id``. Names reported as ``catalog.schema``
    are reduced to the schema part; any other dotted form is skipped.
    On any error the exception is printed and an empty list is returned.
    """
    try:
        print(f"[DEBUG] Searching for companies in catalog: {catalog}")

        # Make the catalog current so listDatabases() enumerates its schemas.
        spark.sql(f"USE CATALOG {catalog}")
        all_schemas = spark.catalog.listDatabases()

        print(f"[DEBUG] Total schemas found in {catalog}: {len(all_schemas)}")

        found = []
        for entry in all_schemas:
            schema_name = entry.name
            print(f"[DEBUG] Checking schema: {schema_name}")

            if '.' in schema_name:
                # Only the exact two-part "<catalog>.<schema>" form is accepted.
                prefix, _, remainder = schema_name.partition('.')
                if prefix != catalog or '.' in remainder:
                    print(f"[DEBUG]   -> Skipped (unexpected format: {schema_name})")
                    continue
                schema_name = remainder
                print(f"[DEBUG]   -> Extracted schema: {schema_name}")

            if not is_valid_company_id(schema_name):
                print(f"[DEBUG]   -> ✗ Invalid company ID format: {schema_name} (len={len(schema_name)}, alnum={schema_name.isalnum()}, lower={schema_name.islower()})")
                continue

            print(f"[DEBUG]   -> ✓ Valid company ID: {schema_name}")
            found.append(schema_name)

        found.sort()
        return found
    except Exception as e:
        print(f"Error discovering companies: {e}")
        traceback.print_exc()
        return []

In [0]:
def normalize_finding_id(col):
    """Return *col* trimmed, with zero-length results replaced by NULL."""
    trimmed = F.trim(col)
    is_blank = F.length(trimmed) == 0
    return F.when(is_blank, F.lit(None)).otherwise(trimmed)

def parse_iso8601_to_ts(col):
    """Convert a timestamp-string column to a Spark timestamp via ``F.to_timestamp``."""
    as_timestamp = F.to_timestamp(col)
    return as_timestamp

## Load Security Hub Controls Reference

Load the reference table mapping control IDs to correct severity levels.

In [0]:
# ============================================================
# LOAD SECURITY HUB CONTROLS REFERENCE TABLE
# ============================================================

# Reference table holding the control_id -> severity mapping.
controls_ref_table = f"{catalog_name}.reference.securityhub_controls"

# Stays None unless the table is found AND can be read; downstream code then
# falls back to the severity carried by the source findings.
controls_ref_df = None

try:
    if not table_exists(controls_ref_table):
        print(f"[WARN] Reference table {controls_ref_table} not found. Severity will be taken from source data.")
    else:
        loaded_ref_df = spark.table(controls_ref_table).select("control_id", "severity")
        ref_count = loaded_ref_df.count()
        print(f"[INFO] Loaded Security Hub controls reference table: {ref_count} mappings")
        # Publish only after the read has actually succeeded.
        controls_ref_df = loaded_ref_df
except Exception as e:
    print(f"[WARN] Could not load reference table: {e}. Severity will be taken from source data.")

## Data Transformation Functions

Transform ASFF and OCSF formats to canonical schema, excluding archived findings.

In [0]:
def transform_asff(df):
    """
    Transform ASFF (AWS Security Finding Format) to canonical schema.

    Excludes findings whose RecordState is ARCHIVED and normalizes fields.
    Rows with a NULL RecordState are kept: a plain ``!= "ARCHIVED"`` filter
    evaluates to NULL for them and would silently drop them.
    The workflow status is upper-cased but otherwise passed through
    (expected values: NEW, NOTIFIED, SUPPRESSED, RESOLVED).

    Output columns: finding_id, finding_modified_time, finding_status,
    account_id, region_id, standard_id, control_id, compliance_status,
    severity, _bronze_processed_time, _preference.
    """
    record_state = F.col("RecordState")
    # NULL-safe "not archived": only an explicit ARCHIVED state is excluded.
    not_archived = record_state.isNull() | (record_state != "ARCHIVED")

    return (
        df
        .where(not_archived)
        .select(
            normalize_finding_id(F.col("finding_id")).alias("finding_id"),
            parse_iso8601_to_ts(F.col("updated_at")).alias("finding_modified_time"),
            F.upper(F.col("workflow.Status")).alias("finding_status"),
            F.col("aws_account_id").cast("string").alias("account_id"),
            F.col("finding_region").cast("string").alias("region_id"),
            # Only the first associated standard is used.
            F.expr("compliance.AssociatedStandards[0].StandardsId").cast("string").alias("standard_id"),
            F.col("compliance.SecurityControlId").cast("string").alias("control_id"),
            F.col("compliance.Status").cast("string").alias("compliance_status"),
            F.col("severity.Label").cast("string").alias("severity"),
            F.col("cf_processed_time").alias("_bronze_processed_time"),
            F.lit(1).alias("_preference")  # ASFF preferred over OCSF
        )
    )

In [0]:
def transform_ocsf(df):
    """
    Transform OCSF (Open Cybersecurity Schema Framework) to canonical schema.

    Excludes findings whose unmapped.RecordState is ARCHIVED and normalizes
    fields. Rows with a NULL RecordState are kept: a plain ``!= "ARCHIVED"``
    filter evaluates to NULL for them and would silently drop them.

    finding_status is derived as follows, to match the ASFF values
    (NEW, NOTIFIED, SUPPRESSED, RESOLVED):
      1. unmapped.WorkflowState, upper-cased, when present;
      2. otherwise NOTIFIED when the OCSF status is "in progress";
      3. otherwise the OCSF status, upper-cased.

    Output columns: finding_id, finding_modified_time, finding_status,
    account_id, region_id, standard_id, control_id, compliance_status,
    severity, _bronze_processed_time, _preference.
    """
    record_state = F.col("unmapped.RecordState")
    # NULL-safe "not archived": only an explicit ARCHIVED state is excluded.
    not_archived = record_state.isNull() | (record_state != "ARCHIVED")

    status_upper = F.upper(F.col("status").cast("string"))

    return (
        df
        .where(not_archived)
        .select(
            normalize_finding_id(F.col("finding_info.uid")).alias("finding_id"),
            parse_iso8601_to_ts(F.col("finding_info.modified_time_dt")).alias("finding_modified_time"),
            # Map OCSF workflow states to match ASFF uppercase format.
            # The OCSF status caption is "In Progress" (with a space), which
            # upper-cases to "IN PROGRESS"; both spellings are accepted.
            F.when(F.col("unmapped.WorkflowState").isNotNull(), F.upper(F.col("unmapped.WorkflowState")))
             .when(status_upper.isin("IN_PROGRESS", "IN PROGRESS"), "NOTIFIED")
             .otherwise(status_upper)
             .alias("finding_status"),
            F.col("cloud.account.uid").cast("string").alias("account_id"),
            F.col("cloud.region").cast("string").alias("region_id"),
            # Only the first listed standard is used.
            F.expr("compliance.standards[0]").cast("string").alias("standard_id"),
            F.col("compliance.control").cast("string").alias("control_id"),
            F.col("compliance.status").cast("string").alias("compliance_status"),
            F.col("severity").cast("string").alias("severity"),
            F.col("cf_processed_time").alias("_bronze_processed_time"),
            F.lit(0).alias("_preference")  # OCSF fallback
        )
    )

## Aggregation Functions

Aggregate findings to control-level and account/region summaries with compliance scores.

In [0]:
def aggregate_findings_to_controls(findings_df):
    """
    Aggregate findings to control-level status.

    Groups findings by (account_id, region_id, standard_id, control_id) and
    produces one row per control with:
      - active_cnt / failed_cnt / passed_cnt / unknown_cnt: counts over
        non-suppressed findings (unknown = WARNING or NOT_AVAILABLE);
      - total_cnt: all findings, suppressed included;
      - control_status: NO_DATA when no finding is active, FAILED when any
        active finding failed, UNKNOWN when any is unknown, PASSED when every
        active finding passed, otherwise UNKNOWN;
      - severity: the highest severity seen among the control's findings
        (critical > high > medium > low), else "unclassified".

    A finding whose finding_status is NULL is treated as NOT suppressed.
    Comparing NULL to "SUPPRESSED" yields NULL, which would otherwise leave
    the finding out of every active count and could turn a control with real
    findings into NO_DATA.
    """
    # Normalize compliance status and severity
    findings = (
        findings_df
        .withColumn("compliance_status", F.upper("compliance_status"))
        .withColumn(
            "severity",
            F.when(F.col("severity").isNull(), "unclassified")
             .otherwise(F.lower("severity"))
        )
        .withColumn(
            "is_suppressed",
            # Always TRUE/FALSE, never NULL: a NULL status counts as active.
            F.when(F.upper(F.col("finding_status")) == F.lit("SUPPRESSED"), F.lit(True))
             .otherwise(F.lit(False))
        )
        .withColumn(
            "severity_rank",
            F.when(F.col("severity") == "critical", 4)
             .when(F.col("severity") == "high", 3)
             .when(F.col("severity") == "medium", 2)
             .when(F.col("severity") == "low", 1)
             .otherwise(0)
        )
    )

    # Control-level aggregation
    control_key = ["account_id", "region_id", "standard_id", "control_id"]

    is_active = ~F.col("is_suppressed")
    status = F.col("compliance_status")

    controls = (
        findings
        .groupBy(*control_key)
        .agg(
            # Count-based aggregation over active (non-suppressed) findings
            F.sum(F.when(is_active, 1).otherwise(0)).alias("active_cnt"),
            F.sum(F.when(is_active & (status == "FAILED"), 1).otherwise(0)).alias("failed_cnt"),
            F.sum(F.when(is_active & (status == "PASSED"), 1).otherwise(0)).alias("passed_cnt"),
            F.sum(F.when(is_active & status.isin("WARNING", "NOT_AVAILABLE"), 1).otherwise(0)).alias("unknown_cnt"),
            F.count("*").alias("total_cnt"),
            F.max("severity_rank").alias("max_severity_rank")
        )
        .withColumn(
            "control_status",
            # Order matters: the first matching rule wins.
            F.when(F.col("active_cnt") == 0, "NO_DATA")
             .when(F.col("failed_cnt") > 0, "FAILED")
             .when(F.col("unknown_cnt") > 0, "UNKNOWN")
             .when(F.col("passed_cnt") == F.col("active_cnt"), "PASSED")
             .otherwise("UNKNOWN")
        )
        .withColumn(
            "severity",
            # Map the numeric rank back to its label.
            F.when(F.col("max_severity_rank") == 4, "critical")
             .when(F.col("max_severity_rank") == 3, "high")
             .when(F.col("max_severity_rank") == 2, "medium")
             .when(F.col("max_severity_rank") == 1, "low")
             .otherwise("unclassified")
        )
        .drop("max_severity_rank")
    )

    return controls

In [0]:
def aggregate_account_region_summary(controls_df, company_id, cf_processed_time):
    """
    Roll control-level rows up to one summary row per (account_id, region_id).

    Each output row carries the overall pass score for the region plus a
    ``standards_summary`` array with one entry per standard, each of which
    in turn carries a per-severity breakdown.

    Args:
        controls_df: control-level rows with account_id, region_id,
            standard_id, control_id, control_status and severity columns.
        company_id: value written to the ``company_id`` output column.
        cf_processed_time: value written (via ``F.to_timestamp(F.lit(...))``)
            to the ``cf_processed_time`` output column.

    Returns:
        DataFrame with columns company_id, cf_processed_time, account_id,
        region_id, control_pass_score, total_rules, total_passed,
        standards_summary.
    """
    per_region = ["account_id", "region_id"]
    per_standard = ["account_id", "region_id", "standard_id"]

    # 1 for a PASSED control, 0 otherwise; summed to count passed controls.
    passed_flag = F.when(F.col("control_status") == "PASSED", 1).otherwise(0)

    # Percentage of passed controls, guarded against an empty group.
    pct_passed = (
        F.when(F.col("total") > 0, F.col("passed") * 100.0 / F.col("total"))
         .otherwise(0.0)
    )

    # ---- Per (standard, severity) breakdown --------------------------------
    by_severity = controls_df.groupBy(*per_standard, "severity").agg(
        F.countDistinct("control_id").alias("total"),
        F.sum(passed_flag).cast("int").alias("passed"),
    )
    by_severity = by_severity.withColumn("score", F.round(pct_passed, 2))

    # ---- Per standard: totals plus the nested severity entries -------------
    severity_entry = F.struct(
        F.col("severity").alias("level"),
        "score",
        F.struct("total", "passed").alias("controls"),
    )
    by_standard = by_severity.groupBy(*per_standard).agg(
        F.sum("total").alias("total"),
        F.sum("passed").alias("passed"),
        F.collect_list(severity_entry).alias("controls_by_severity"),
    )
    by_standard = by_standard.withColumn("score", F.round(pct_passed, 2))

    standard_entry = F.struct(
        F.col("standard_id").alias("std"),
        "score",
        F.struct("total", "passed").alias("controls"),
        "controls_by_severity",
    )
    by_standard = by_standard.select(
        *per_standard,
        standard_entry.alias("standard_summary"),
    )

    # ---- Per (account, region): overall totals and score -------------------
    region_totals = controls_df.groupBy(*per_region).agg(
        # A rule is identified by its (standard, control) pair.
        F.countDistinct(F.struct("standard_id", "control_id")).alias("total_rules"),
        F.sum(passed_flag).cast("int").alias("total_passed"),
    )
    region_score = (
        F.when(F.col("total_rules") > 0, (F.col("total_passed") / F.col("total_rules")) * 100)
         .otherwise(0.0)
    )
    region_totals = region_totals.withColumn("control_pass_score", F.round(region_score, 2))

    # ---- Attach the list of standard summaries to each region --------------
    standards_per_region = by_standard.groupBy(*per_region).agg(
        F.collect_list("standard_summary").alias("standards_summary")
    )
    joined = region_totals.join(standards_per_region, per_region)

    stamped = (
        joined
        .withColumn("cf_processed_time", F.to_timestamp(F.lit(cf_processed_time)))
        .withColumn("company_id", F.lit(company_id))
    )

    # Column order matters: the result is later written positionally.
    standards_summary_df = stamped.select(
        "company_id",
        "cf_processed_time",
        "account_id",
        "region_id",
        "control_pass_score",
        "total_rules",
        "total_passed",
        "standards_summary",
    )

    return standards_summary_df

## Company Discovery

Determine which companies to process based on job parameters.

In [0]:
# Resolve the list of company schemas this run will process.
if company_index_id_param and company_index_id_param.upper() != "ALL":
    # Single company mode: the supplied ID must be well-formed.
    if not is_valid_company_id(company_index_id_param):
        raise ValueError(f"Invalid company_id format: {company_index_id_param}. Must be 12 lowercase alphanumeric characters.")
    companies_to_process = [company_index_id_param]
    print(f"\n[INFO] Single company mode: {company_index_id_param}")
else:
    # Blank or "ALL": scan the catalog for company schemas.
    companies_to_process = discover_companies(catalog_name)
    print(f"\n[INFO] Auto-discovery mode: Found {len(companies_to_process)} companies")
    if companies_to_process:
        # Show at most the first ten IDs and summarize the rest.
        preview = ', '.join(companies_to_process[:10])
        overflow = len(companies_to_process) - 10
        if overflow > 0:
            print(f"[INFO] Companies: {preview}... and {overflow} more")
        else:
            print(f"[INFO] Companies: {preview}")

print(f"\n[INFO] Total companies to process: {len(companies_to_process)}")
print("="*80)

## Table Schema Definitions

Define output table schemas for standards summary and account compliance.

In [0]:
def ensure_output_tables_exist(catalog_name, company_id):
    """
    Create the two output Delta tables for a company if they are missing.

    Issues ``CREATE TABLE IF NOT EXISTS`` for
    ``<catalog>.<company>.aws_standard_summary`` and
    ``<catalog>.<company>.aws_account_compliance_summary``; existing tables
    are left untouched.
    """
    schema_prefix = f"{catalog_name}.{company_id}"

    # Region-level table: account + region + per-standard breakdown.
    standards_summary_tbl = f"{schema_prefix}.aws_standard_summary"
    # Account-level table: aggregated across regions.
    account_summary_tbl = f"{schema_prefix}.aws_account_compliance_summary"

    standards_summary_ddl = f"""
    CREATE TABLE IF NOT EXISTS {standards_summary_tbl} (
      company_id STRING,
      cf_processed_time TIMESTAMP,
      account_id STRING,
      region_id STRING,
      control_pass_score FLOAT,
      total_rules INT,
      total_passed INT,
      standards_summary ARRAY<STRUCT<
        std: STRING,
        score: FLOAT,
        controls: STRUCT<
          total: INT,
          passed: INT
        >,
        controls_by_severity: ARRAY<STRUCT<
          level: STRING,
          score: FLOAT,
          controls: STRUCT<
            total: INT,
            passed: INT
          >
        >>
      >>
    )
    USING DELTA
    COMMENT 'Regional standards compliance summary with per-standard and per-severity breakdowns'
    """

    account_summary_ddl = f"""
    CREATE TABLE IF NOT EXISTS {account_summary_tbl} (
      company_id STRING COMMENT 'データが属する企業のID',
      cf_processed_time TIMESTAMP COMMENT '集計日 (Jobの実行日時)',
      account_id STRING COMMENT 'アカウントID',
      score FLOAT COMMENT 'コントロールに基づく総合スコア (passed_rules / total_rules * 100)',
      total_rules INT COMMENT '適用コントロール総数 (passed + failed + unknown)',
      total_passed INT COMMENT '合格コントロール数'
    )
    USING DELTA
    COMMENT 'Account-level compliance summary aggregated across all regions'
    """

    # Same order as before: standards summary first, then account summary.
    for ddl in (standards_summary_ddl, account_summary_ddl):
        spark.sql(ddl)

    print(f"[SCHEMA] Ensured tables exist for company {company_id}")

## Main Processing Function

Load bronze data, transform to canonical format, aggregate summaries, and write results.

In [0]:
# ============================================================
# PROCESSING FUNCTION
# ============================================================

def process_company(company_id, catalog_name, window_start_ts, window_end_ts, cf_processed_time):
    """
    Process a single company: load bronze → transform → aggregate → write summaries.

    Steps:
      1. Read the ASFF and/or OCSF bronze tables, restricted to Security Hub
         rows whose cf_processed_time lies in [window_start_ts, window_end_ts).
      2. Map both sources to the canonical schema, union them and keep one
         row per finding_id (latest modification first, ASFF preferred on ties).
      3. If the module-level ``controls_ref_df`` is available, replace each
         finding's severity with the reference severity for its control_id.
      4. Aggregate to control level, then to account/region level, and
         replace the contents of the two output tables.

    Args:
        company_id: schema name of the company inside the catalog.
        catalog_name: catalog holding the company schema.
        window_start_ts: inclusive lower bound for cf_processed_time.
        window_end_ts: exclusive upper bound for cf_processed_time.
        cf_processed_time: timestamp stamped on the output rows.

    Returns:
        tuple (success: bool, message: str). Benign "nothing to do" outcomes
        return False with one of "No bronze tables", "No data in window" or
        "No valid findings". Any exception is caught, printed, and returned
        as (False, str(exception)); this function does not raise.
    """
    print("\n" + "="*80)
    print(f"PROCESSING COMPANY: {company_id}")
    print("="*80)

    try:
        # Define table names for this company
        asff_tbl = f"{catalog_name}.{company_id}.aws_securityhub_findings_1_0"
        ocsf_tbl = f"{catalog_name}.{company_id}.aws_securitylake_sh_findings_2_0"
        standards_summary_tbl = f"{catalog_name}.{company_id}.aws_standard_summary"

        print(f"[TABLE] ASFF Bronze:   {asff_tbl}")
        print(f"[TABLE] OCSF Bronze:   {ocsf_tbl}")
        print(f"[TABLE] Standards Summary:  {standards_summary_tbl}")
        print("-"*80)

        # ============================================================
        # BRONZE → IN-MEMORY: Load and Transform
        # ============================================================

        # Check table existence
        asff_exists = table_exists(asff_tbl)
        ocsf_exists = table_exists(ocsf_tbl)

        print(f"[CHECK] ASFF table exists: {asff_exists}")
        print(f"[CHECK] OCSF table exists: {ocsf_exists}")

        if not asff_exists and not ocsf_exists:
            print(f"[SKIP] Neither bronze table exists for {company_id}")
            return (False, "No bronze tables")

        # Half-open processing window on the bronze ingestion timestamp.
        in_window = (
            (F.col("cf_processed_time") >= window_start_ts) &
            (F.col("cf_processed_time") < window_end_ts)
        )

        # Each entry: (raw DataFrame, transform function to canonical schema)
        sources = []

        # Load ASFF data
        if asff_exists:
            df_asff_raw = (
                spark.table(asff_tbl)
                .where((F.col("product_name") == "Security Hub") & in_window)
            )
            asff_count = df_asff_raw.count()
            print(f"[DATA] ASFF rows in window: {asff_count:,}")
            if asff_count > 0:
                sources.append((df_asff_raw, transform_asff))

        # Load OCSF data
        if ocsf_exists:
            df_ocsf_raw = (
                spark.table(ocsf_tbl)
                .where((F.col("metadata.product.name") == "Security Hub") & in_window)
            )
            ocsf_count = df_ocsf_raw.count()
            print(f"[DATA] OCSF rows in window: {ocsf_count:,}")
            if ocsf_count > 0:
                sources.append((df_ocsf_raw, transform_ocsf))

        if not sources:
            print(f"[SKIP] No data found in 48-hour window for {company_id}")
            return (False, "No data in window")

        # Transform each source and drop rows without a usable finding ID
        canonical_dfs = [
            transform(df_raw)
            .withColumn("finding_id", normalize_finding_id(F.col("finding_id")))
            .where(F.col("finding_id").isNotNull())
            for df_raw, transform in sources
        ]

        df_union = canonical_dfs[0]
        for d in canonical_dfs[1:]:
            df_union = df_union.unionByName(d, allowMissingColumns=True)

        union_count = df_union.count()
        print(f"[TRANSFORM] Union rows: {union_count:,}")

        # Deduplicate: newest modification wins; on ties prefer ASFF
        # (_preference = 1), then the most recently ingested bronze row.
        w = Window.partitionBy("finding_id").orderBy(
            F.col("finding_modified_time").desc_nulls_last(),
            F.col("_preference").desc(),
            F.col("_bronze_processed_time").desc_nulls_last()
        )

        findings = (
            df_union
            .withColumn("_rn", F.row_number().over(w))
            .where(F.col("_rn") == 1)
            .drop("_rn", "_preference", "_bronze_processed_time")
        )

        findings_count = findings.count()
        print(f"[TRANSFORM] Deduplicated findings: {findings_count:,}")

        # The sources had rows, but ARCHIVED / blank-ID filtering may have
        # removed all of them. Report that as a skip rather than a success
        # with nothing written.
        if findings_count == 0:
            print(f"[SKIP] No valid findings after filtering for {company_id}")
            return (False, "No valid findings")

        # ============================================================
        # APPLY CORRECT SEVERITY FROM REFERENCE TABLE
        # ============================================================

        if controls_ref_df is not None:
            # Left join with reference table to get correct severity
            findings = (
                findings
                .join(
                    controls_ref_df.withColumnRenamed("severity", "ref_severity"),
                    on="control_id",
                    how="left"
                )
                .withColumn(
                    "severity",
                    # Use reference severity if available, otherwise keep original
                    F.when(F.col("ref_severity").isNotNull(), F.lower(F.col("ref_severity")))
                     .otherwise(F.lower(F.col("severity")))
                )
                .drop("ref_severity")
            )

            print(f"[SEVERITY] Applied reference table corrections (control_id -> severity)")

        else:
            print(f"[SEVERITY] Using severity from source data (no reference table)")

        # ============================================================
        # IN-MEMORY → OUTPUT: Aggregate and Write
        # ============================================================

        # Aggregate to control-level
        controls = aggregate_findings_to_controls(findings)

        # A NULL finding_status is not "suppressed". Comparing NULL with !=
        # would drop such rows from the active count and misreport them as
        # suppressed.
        is_suppressed = (
            F.when(F.upper(F.col("finding_status")) == F.lit("SUPPRESSED"), F.lit(True))
             .otherwise(F.lit(False))
        )
        active_count = findings.where(~is_suppressed).count()
        suppressed_count = findings_count - active_count
        print(f"[AGGREGATE] Active findings: {active_count:,}")
        print(f"[AGGREGATE] Suppressed findings: {suppressed_count:,}")

        # Aggregate to account/region summary
        standards_summary_df = aggregate_account_region_summary(controls, company_id, cf_processed_time)

        standards_summary_count = standards_summary_df.count()
        print(f"[AGGREGATE] Standards summary rows: {standards_summary_count}")

        if standards_summary_count > 0:
            # Replace old data (1-day retention strategy) with a single
            # overwrite insert. A separate TRUNCATE followed by an append
            # would leave the table empty if the append failed, and readers
            # could see the empty table in between.
            standards_summary_df.write.insertInto(standards_summary_tbl, overwrite=True)

            # Log text kept as-is for continuity; the net effect is the same.
            print("[WRITE] Standards summary table updated (TRUNCATE + Append)")

            # ============================================================
            # ACCOUNT-LEVEL COMPLIANCE SUMMARY
            # ============================================================

            # Aggregate regional data to account level
            account_summary_tbl = f"{catalog_name}.{company_id}.aws_account_compliance_summary"

            account_summary = (
                standards_summary_df
                .groupBy("company_id", "account_id")
                .agg(
                    F.sum("total_rules").cast("int").alias("total_rules"),
                    F.sum("total_passed").cast("int").alias("total_passed"),
                    F.first("cf_processed_time").alias("cf_processed_time")
                )
                .withColumn(
                    "score",
                    F.round(
                        F.when(F.col("total_rules") > 0, (F.col("total_passed") / F.col("total_rules")) * 100)
                         .otherwise(0.0),
                        7
                    )
                )
                # Column order must match the table: the insert is positional.
                .select(
                    "company_id",
                    "cf_processed_time",
                    "account_id",
                    "score",
                    "total_rules",
                    "total_passed"
                )
            )

            account_count = account_summary.count()
            print(f"[AGGREGATE] Account-level summary rows: {account_count}")

            if account_count > 0:
                # Create account summary table if it doesn't exist
                spark.sql(f"""
                CREATE TABLE IF NOT EXISTS {account_summary_tbl} (
                  company_id STRING COMMENT 'データが属する企業のID',
                  cf_processed_time TIMESTAMP COMMENT '集計日 (Jobの実行日時)',
                  account_id STRING COMMENT 'アカウントID',
                  score FLOAT COMMENT 'コントロールに基づく総合スコア (passed_rules / total_rules * 100)',
                  total_rules INT COMMENT '適用コントロール総数 (passed + failed + unknown)',
                  total_passed INT COMMENT '合格コントロール数'
                )
                USING DELTA
                COMMENT 'Account-level compliance summary aggregated across all regions'
                """)

                # Replace old data (1-day retention strategy) in one overwrite
                account_summary.write.insertInto(account_summary_tbl, overwrite=True)

                print(f"[WRITE] Account summary table updated: {account_summary_tbl}")

        print(f"[SUCCESS] Completed processing for {company_id}")
        return (True, "Success")

    except Exception as e:
        # Per-company isolation: report the failure to the caller instead of
        # aborting the whole run.
        print(f"[ERROR] Failed to process {company_id}: {str(e)}")
        traceback.print_exc()
        return (False, str(e))

## Execute Pipeline

Process all companies sequentially and track results.

In [0]:
# ============================================================
# EXECUTE SEQUENTIAL PROCESSING
# ============================================================

# Track results
successful_companies = []   # company IDs
failed_companies = []       # (company ID, error message)
skipped_companies = []      # (company ID, skip reason)

# Messages process_company returns for benign "nothing to do" outcomes;
# anything else with success == False is treated as a failure.
SKIP_REASONS = {"No bronze tables", "No data in window", "No valid findings"}

for company_id in companies_to_process:
    # Ensure output tables exist before processing. A DDL failure for one
    # company (e.g. missing privileges on its schema) is recorded as a
    # failure for that company only, so the remaining companies are still
    # processed and the summary report is still produced.
    try:
        ensure_output_tables_exist(catalog_name, company_id)
    except Exception as e:
        print(f"[ERROR] Failed to ensure output tables for {company_id}: {str(e)}")
        traceback.print_exc()
        failed_companies.append((company_id, str(e)))
        continue

    success, message = process_company(
        company_id,
        catalog_name,
        window_start_ts,
        window_end_ts,
        cf_processed_time
    )

    if success:
        successful_companies.append(company_id)
    elif message in SKIP_REASONS:
        skipped_companies.append((company_id, message))
    else:
        failed_companies.append((company_id, message))

## Summary Report

Display the processing results and raise an exception if any company failed.

In [0]:
# ============================================================
# SUMMARY OUTPUT
# ============================================================

banner = "=" * 80

# Headline counts
print("\n" + banner)
print("SECURITY HUB STANDARDS ETL PIPELINE SUMMARY")
print(banner)
print(f"Total Companies:    {len(companies_to_process)}")
print(f"[SUCCESS]           {len(successful_companies)}")
print(f"[SKIPPED]           {len(skipped_companies)}")
print(f"[FAILED]            {len(failed_companies)}")
print(banner)

# Per-company detail, one section per outcome (empty sections are omitted)
if successful_companies:
    print(f"\n[SUCCESS] COMPLETED COMPANIES ({len(successful_companies)}):")
    for done_id in successful_companies:
        print(f"  - {done_id}")

if skipped_companies:
    print(f"\n[SKIPPED] COMPANIES ({len(skipped_companies)}):")
    for skipped_id, skip_reason in skipped_companies:
        print(f"  - [{skipped_id}] {skip_reason}")

if failed_companies:
    print(f"\n[FAILED] COMPANIES ({len(failed_companies)}):")
    for failed_id, failure_msg in failed_companies:
        # Error text is capped at 150 characters to keep the report readable.
        print(f"  - [{failed_id}] {failure_msg[:150]}")
    print("\n" + banner)
    # Raising marks the job run as failed.
    raise Exception(f"Pipeline failed for {len(failed_companies)} company(ies). See error details above.")

print("\n" + banner)
print("[SUCCESS] SECURITY HUB STANDARDS ETL COMPLETED SUCCESSFULLY")
print(banner)