In [0]:
%sql
-- Make neon_gtm.bronze the session default so unqualified table names
-- resolve there. The Python cells in this notebook use fully qualified
-- names, so this mainly affects ad-hoc SQL run in the same session.
USE CATALOG neon_gtm;
USE SCHEMA bronze;

In [0]:
%sql
-- Sanity check: display the catalog and schema the session is currently using.
SELECT current_catalog(), current_schema();

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col, create_map, expr, lit, to_timestamp, when
from pyspark.sql.types import StructType, StructField, StringType, FloatType, TimestampType

# ----------------------------------------
# 1. Load metadata from control tables
# ----------------------------------------

source_name = "telecomx_sensor_csv"

# `first()` returns None when no row matches; fail here with a clear message
# instead of an AttributeError on `config.source_config_id` further down.
config = (
    spark.table("neon_gtm.bronze.data_source_config")
    .filter(F.col("source_name") == source_name)
    .first()
)
if config is None:
    raise ValueError(
        f"No row found in neon_gtm.bronze.data_source_config for source_name={source_name!r}"
    )

# Column definitions registered for this source.
schema_df = spark.table("neon_gtm.bronze.source_schema_definition") \
    .filter(F.col("source_config_id") == config.source_config_id)

# Collect the schema ids once on the driver and reuse them below, rather than
# triggering a separate Spark job for every dependent lookup.
schema_ids = [row.schema_id for row in schema_df.select("schema_id").collect()]
if not schema_ids:
    raise ValueError(
        "No rows found in neon_gtm.bronze.source_schema_definition for "
        f"source_config_id={config.source_config_id!r}"
    )

# `mapping_df` is not built here: the joined version assigned in section 2
# replaced the earlier filter-only definition before anything read it.

# Data quality rules attached to any of this source's schema rows.
rules_df = spark.table("neon_gtm.bronze.data_quality_rule") \
    .filter(F.col("schema_id").isin(schema_ids))

# ----------------------------------------
# 2. Build schema dynamically
# ----------------------------------------

# Control-table type name (upper-cased) -> Spark data type.
type_map = {
    "STRING": StringType(),
    "FLOAT": FloatType(),
    "TIMESTAMP": TimestampType(),
}

# Field order follows ordinal_position from the schema definition table.
ordered_schema_df = schema_df.orderBy("ordinal_position")

fields = []
for column_def in ordered_schema_df.collect():
    # Type names missing from type_map fall back to StringType.
    spark_type = type_map.get(column_def.data_type.upper(), StringType())
    fields.append(
        StructField(column_def.column_name, spark_type, column_def.nullable)
    )

schema = StructType(fields)

# Column mappings for this source: mapping rows joined to their schema
# definition rows and restricted to the current source config.
mapping_df = (
    spark.table("neon_gtm.bronze.source_column_mapping")
    .alias("map")
    .join(
        spark.table("neon_gtm.bronze.source_schema_definition").alias("schema"),
        F.col("map.schema_id") == F.col("schema.schema_id")
    )
    .filter(F.col("schema.source_config_id") == config.source_config_id)
)

# Fetch the target table name.
# All mappings of one source must agree on a single target: with no mapping
# rows `.first()` would return None (and indexing it raises TypeError), and
# with several distinct names an arbitrary one would be picked silently.
target_table_names = [
    row.target_table_name
    for row in mapping_df.select("target_table_name").distinct().collect()
    if row.target_table_name is not None
]
if not target_table_names:
    raise ValueError(
        "No target_table_name found in neon_gtm.bronze.source_column_mapping for "
        f"source_config_id={config.source_config_id!r}"
    )
if len(target_table_names) > 1:
    raise ValueError(
        "Expected exactly one target table for "
        f"source_config_id={config.source_config_id!r}, found: {sorted(target_table_names)}"
    )
target_table = target_table_names[0]

In [0]:
# ----------------------------------------
# 3. Read source data (batch read)
# ----------------------------------------
# NOTE(review): this is a one-shot batch `spark.read`, not Auto Loader
# (which would be a `readStream` using the "cloudFiles" format).
source_options = config.connection_details

# Format, header flag and path all come from the source config row; the
# schema is the one assembled from the schema definition table.
source_reader = (
    spark.read
    .format(config.source_type)
    .option("header", source_options["header"])
    .schema(schema)
)
df_raw = source_reader.load(source_options["path"])

# Show the freshly loaded rows before any transformation is applied.
df_raw.display()
# ----------------------------------------
# 4. Apply transformation hints
# ----------------------------------------

# Only schema rows carrying a non-empty hint take part.
hinted_columns = [
    (column_def.column_name, column_def.transformation_hint)
    for column_def in schema_df.collect()
    if column_def.transformation_hint
]

for column_name, hint_sql in hinted_columns:
    # The hint is evaluated as a Spark SQL expression and stored under the
    # column's own name.
    df_raw = df_raw.withColumn(column_name, expr(hint_sql))

# ----------------------------------------
# 5. Apply column mappings
# ----------------------------------------

# (source name, target name) pairs taken from the mapping table.
column_renames = [
    (mapping.source_column_name, mapping.target_column_name)
    for mapping in mapping_df.collect()
]

# Renames are applied one at a time, in the order the rows were returned.
for source_column, target_column in column_renames:
    df_raw = df_raw.withColumnRenamed(source_column, target_column)

# ----------------------------------------
# 6. Apply validation rules
# ----------------------------------------

# Each rule contributes one filter expression.
validation_expressions = [
    rule.validation_expression for rule in rules_df.collect()
]

# Filters are stacked, so a row survives only if it passes every rule.
for validation_expression in validation_expressions:
    df_raw = df_raw.filter(validation_expression)

# ----------------------------------------
# 7. Enrich with static or metadata-driven values
# ----------------------------------------

# Lookup tables: sensor rows (equipment_id, sensor_id, unit_id) and
# measurement units (unit_id, unit_type).
sensor_df = (
    spark.table("neon_gtm.bronze.sensor")
    .select("equipment_id", "sensor_id", "unit_id")
)
unit_df = (
    spark.table("neon_gtm.bronze.measurement_unit")
    .select("unit_id", "unit_type")
)

# Normalise the timestamp and expose the device id under the join key name.
df_keyed = (
    df_raw
    .withColumn("timestamp", F.to_timestamp("timestamp", "yyyy-MM-dd'T'HH:mm:ss"))
    .withColumnRenamed("device_id", "equipment_id")
)

# Left joins keep every reading even when no sensor / unit row matches.
df_with_sensor = df_keyed.join(sensor_df, on="equipment_id", how="left")
df_enriched = df_with_sensor.join(unit_df, on="unit_id", how="left")

# Shape the rows for the target: rename the measurement, copy the unit type
# into metric_name, add a generated reading id and a source-tagging map.
df_transformed = (
    df_enriched
    .withColumnRenamed("power", "metric_value")
    .withColumn("metric_name", F.col("unit_type"))
    .withColumn("reading_id", F.expr("uuid()"))
    .withColumn("metadata", F.create_map(F.lit("source"), F.lit(source_name)))
    .drop("unit_type")
)
# ----------------------------------------
# 8. Validate against the target table and write
# ----------------------------------------
target_table_fqn = f"neon_gtm.bronze.{target_table}"
target_schema = spark.table(target_table_fqn).schema
not_null_cols = [f.name for f in target_schema.fields if not f.nullable]

# quality_flag is created in this step and is never null, so it must not be
# part of the check (it does not exist on df_transformed yet).
required_cols = [c for c in not_null_cols if c.lower() != "quality_flag"]

# A NOT NULL target column that the DataFrame lacks cannot be validated;
# report it by name instead of failing on an unresolved column reference.
available_cols = {c.lower() for c in df_transformed.columns}
missing_cols = [c for c in required_cols if c.lower() not in available_cols]
if missing_cols:
    raise ValueError(
        f"Columns required (NOT NULL) by {target_table_fqn} are missing from "
        f"the transformed data: {missing_cols}"
    )

# Build composite condition: all NOT NULL columns must be non-null.
# Starting from a literal TRUE keeps the condition valid when the target has
# no NOT NULL columns; backtick-quoting keeps unusual column names resolvable.
valid_condition = F.lit(True)
for required_col in required_cols:
    quoted_col = "`" + required_col.replace("`", "``") + "`"
    valid_condition = valid_condition & F.col(quoted_col).isNotNull()

# Apply quality_flag logic
df_validated = df_transformed.withColumn(
    "quality_flag",
    F.when(valid_condition, F.lit("Valid")).otherwise(F.lit("Invalid")),
)

# Show rejected rows for inspection; only valid rows are persisted.
df_validated.filter("quality_flag = 'Invalid'").display()

# NOTE(review): mode("overwrite") replaces the table's existing contents on
# every run — confirm that a full reload (rather than "append") is intended.
df_validated.filter("quality_flag = 'Valid'").write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable(target_table_fqn)