In [0]:
# Create the gold schema if it does not exist yet. The name is unqualified,
# so it is resolved against the session's current catalog.
gold_schema_ddl = "CREATE SCHEMA IF NOT EXISTS gold"
spark.sql(gold_schema_ddl)

# Print the active catalog and database as a quick check of the session context.
session_context = spark.sql("SELECT current_catalog(), current_database()")
session_context.show()

In [0]:
%sql
-- Create the volume workspace.default.output; the statement is a no-op when it already exists.
CREATE VOLUME IF NOT EXISTS workspace.default.output

In [0]:
from pyspark.sql.functions import col, array, lit, size, collect_set, concat_ws, nullif, when

clients = spark.table("silver.clients_enriched")
risk_events = spark.table("silver.risk_events")
model_outputs = spark.table("silver.model_outputs")

# Source trigger column in silver.risk_events -> per-client summary column.
TRIGGER_COLUMNS = {f"trigger_reason_{i}": f"rule_triggers_{i}" for i in range(1, 5)}


def _blank_to_null(column_name):
    """Return the named column with "" replaced by null (nulls stay null).

    `when` without `otherwise` yields null whenever the condition is false
    or null, which covers both the empty-string and the null case.
    """
    return when(col(column_name) != "", col(column_name)).alias(column_name)


def _distinct_reasons(source_column, summary_column):
    """Aggregate the distinct non-blank values of `source_column` into one string.

    Blank values are nulled first so `collect_set` drops them; the remaining
    distinct reasons are joined with ", " so that several reasons stay
    readable instead of running together. The order of the reasons inside
    the string is whatever `collect_set` returns and is not guaranteed.
    """
    non_blank = when(col(source_column) != "", col(source_column))
    return concat_ws(", ", collect_set(non_blank)).alias(summary_column)


# Combine rule-based flags: one row per client_id with the reasons seen per rule.
risk_reasons_raw = risk_events.groupBy("client_id").agg(
    *[_distinct_reasons(src, dst) for src, dst in TRIGGER_COLUMNS.items()]
)

rule_trigger_columns = list(TRIGGER_COLUMNS.values())

# concat_ws returns "" (not null) when there is nothing to join, so convert
# blanks to null. Only the trigger columns are touched; client_id is a key
# and is passed through unchanged.
risk_reasons = risk_reasons_raw.select(
    "client_id",
    *[_blank_to_null(name) for name in rule_trigger_columns],
)

risk_summary = (
    risk_reasons
    # concat_ws skips null inputs, so only rules that actually fired appear.
    .withColumn(
        "triggered_risk_events",
        concat_ws(", ", *[col(name) for name in rule_trigger_columns]),
    )
    # When no rule fired the join above is "", which would pass an
    # isNotNull() check; store null so "no triggers" is represented as null.
    .withColumn("triggered_risk_events", _blank_to_null("triggered_risk_events"))
)

In [0]:
# Destinations for the final result.
OUTPUT_CSV_PATH = "/Volumes/workspace/default/output/client_risk_scores"
GOLD_TABLE = "gold.client_risk_scores"

# Only the model score is needed from the model outputs.
# NOTE(review): if model_outputs can hold more than one row per client_id,
# this left join duplicates client rows — confirm the table is unique per client.
model_scores = model_outputs.select("client_id", "output_value")

# Null-safe comparison: a client without a model output (null output_value
# after the left join) yields False instead of null.
model_flag = col("output_value").eqNullSafe(1)

# A client has rule triggers only when the summary text is present and
# non-empty; an empty string must not count as a trigger. The expression is
# never null: for a null column it evaluates to False AND null = False.
has_rule_triggers = (
    col("triggered_risk_events").isNotNull()
    & (col("triggered_risk_events") != "")
)

# Merge all: every client is kept (left joins) and receives a strict
# True/False `flagged` value.
final_risk_df = (
    clients
    .join(risk_summary, on="client_id", how="left")
    .join(model_scores, on="client_id", how="left")
    .withColumn("flagged", model_flag | has_rule_triggers)
)

# Write final output as CSV with a header row, replacing any previous export.
(
    final_risk_df.write
    .mode("overwrite")
    .option("header", "true")
    .format("csv")
    .save(OUTPUT_CSV_PATH)
)

# Optional: save as table
final_risk_df.write.mode("overwrite").saveAsTable(GOLD_TABLE)