In [None]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
import json

In [None]:
# Configuration: the three parts of the target table's name
database_name = "cloudfastener"
schema_name = "reference"
table_name = "securityhub_controls"
full_table_name = ".".join((database_name, schema_name, table_name))

# Relative path: the JSON file is expected next to this notebook (see NOTE below)
json_file_path = "securityhub_controls.json"

# Echo the effective settings so a run log shows what was targeted
for banner_line in (
    f"Target table: {full_table_name}",
    f"Source file: {json_file_path}",
    "NOTE: JSON file should be in the same Workspace directory as this notebook",
):
    print(banner_line)

In [None]:
# Ensure the catalog and schema that will hold the reference table exist.
# `full_table_name` is a three-part name (catalog.schema.table), so its first
# level must be a catalog. CREATE DATABASE is an alias for CREATE SCHEMA and
# would create a stray schema named `database_name` in whatever catalog is
# current, rather than the catalog the three-part name refers to.
# NOTE(review): assumes a Unity Catalog-enabled workspace where this principal
# may run CREATE CATALOG — confirm before merging.
spark.sql(f"CREATE CATALOG IF NOT EXISTS {database_name}")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {database_name}.{schema_name}")

print(f"✓ Database and schema verified: {database_name}.{schema_name}")

In [None]:
# Table schema: two non-nullable string columns, built field by field
schema = (
    StructType()
    .add("control_id", StringType(), nullable=False)
    .add("severity", StringType(), nullable=False)
)

print("✓ Schema defined:")
print(schema)

In [None]:
# Read the JSON file from the Workspace with plain Python, then build the
# DataFrame from the parsed records. `json` comes from the notebook's import cell.
# JSON text is UTF-8, so decode explicitly instead of relying on the platform's
# default encoding.
with open(json_file_path, "r", encoding="utf-8") as f:
    json_data = json.load(f)

# An empty payload would rebuild the reference table with no rows; stop before
# any later cell touches the existing table.
if not json_data:
    raise ValueError(f"No control records found in {json_file_path}")

# Create DataFrame from JSON data
df = spark.createDataFrame(json_data, schema=schema)

# Every count() is a separate Spark action; compute each figure once and reuse it
total_records = df.count()

# Show sample data
print(f"✓ Loaded {total_records} control records")
print("\nSample data:")
df.show(10, truncate=False)

# Data quality checks
distinct_control_ids = df.select("control_id").distinct().count()
null_control_ids = df.filter(df.control_id.isNull()).count()
null_severities = df.filter(df.severity.isNull()).count()

print("\nData Quality Checks:")
print(f"- Total records: {total_records}")
print(f"- Distinct control_ids: {distinct_control_ids}")
print(f"- Null control_ids: {null_control_ids}")
print(f"- Null severities: {null_severities}")

# Show severity distribution
print("\nSeverity Distribution:")
df.groupBy("severity").count().orderBy("severity").show()

# A lookup table keyed by control_id must have one row per key; duplicates
# would multiply rows in any join against it. Fail here, after the diagnostics
# above have been printed, rather than after the table has been rebuilt.
if distinct_control_ids != total_records:
    raise ValueError(
        f"Duplicate control_id values in {json_file_path}: "
        f"{total_records} records but only {distinct_control_ids} distinct control_ids"
    )

In [None]:
# Remove any previous copy of the table so the rebuild starts clean
drop_statement = f"DROP TABLE IF EXISTS {full_table_name}"
spark.sql(drop_statement)
print(f"✓ Dropped existing table (if any): {full_table_name}")

In [None]:
# Save the DataFrame as a Delta table, overwriting existing data and schema
delta_writer = (
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
delta_writer.saveAsTable(full_table_name)

print(f"✓ Table created successfully: {full_table_name}")

In [None]:
# Add table and column comments for documentation.
# The table comment is assembled from adjacent string literals so the stored
# text is a single line; writing it inside a triple-quoted SQL block would
# persist the source line break and indentation as part of the comment.
table_comment = (
    "Reference table for AWS Security Hub control ID to severity mappings. "
    "Used in bronze_to_gold pipeline for correct severity attribution."
)
spark.sql(f"COMMENT ON TABLE {full_table_name} IS '{table_comment}'")

spark.sql(f"ALTER TABLE {full_table_name} CHANGE COLUMN control_id COMMENT 'Security Hub control identifier (e.g., ACM.1, S3.1)'")
spark.sql(f"ALTER TABLE {full_table_name} CHANGE COLUMN severity COMMENT 'Control severity level: LOW, MEDIUM, HIGH, or CRITICAL'")

print("✓ Table comments added")

In [None]:
# Verification: read the table back and display its metadata and contents
banner_rule = "=" * 80
print(f"\n{banner_rule}")
print("TABLE VERIFICATION")
print(f"{banner_rule}\n")

# Table metadata
print("Table Description:")
spark.sql(f"DESCRIBE EXTENDED {full_table_name}").show(truncate=False)

# Row count; `record_count` is read again by the summary cell
created_table = spark.table(full_table_name)
record_count = created_table.count()
print(f"\n✓ Total records in table: {record_count}")

# First rows of the table
print("\nSample records from table:")
created_table.show(20, truncate=False)

# Spot-check a handful of specific control ids
print("\nExample lookups:")
example_lookup_sql = f"""
    SELECT control_id, severity
    FROM {full_table_name}
    WHERE control_id IN ('ACM.1', 'S3.1', 'EC2.1', 'IAM.1')
    ORDER BY control_id
"""
spark.sql(example_lookup_sql).show(truncate=False)

In [None]:
# Summary report: recap what was built and print a copy-pasteable usage example.
# Relies on `full_table_name` and `record_count` set by earlier cells.
print(f"\n{'='*80}")
print("SUMMARY")
print(f"{'='*80}\n")
print(f"✓ Table Name: {full_table_name}")
print(f"✓ Record Count: {record_count}")
print("✓ Columns: control_id (STRING), severity (STRING)")
print("✓ Table Format: Delta")
print("\n✓ Table is ready for use in bronze_to_gold_v2 pipeline!")
print("\nUsage example in your pipeline:")
# The example selects all left-side columns with df["*"]; the attribute form
# `df.*` is not valid Python and would fail as soon as the snippet is pasted.
print(f"""
# Join with controls reference table to get correct severity
controls_ref = spark.table("{full_table_name}")
df_with_severity = df.join(
    controls_ref,
    df.control_id == controls_ref.control_id,
    "left"
).select(
    df["*"],
    controls_ref.severity.alias("correct_severity")
)
""")