# UFC Athletes Transform & Clean (Bronze -> Silver)

Cleans the bronze `espn_athletes` table and writes the result to `espn_athletes_silver`. Shows BEFORE/AFTER previews, keeps only the latest record per athlete, and drops near-empty columns.


In [None]:
from datetime import datetime, timezone
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Notebook setup: widgets, storage credentials, catalog and databases.
# Every optional step is best-effort so the notebook can still run when
# `dbutils` or the secret scope is unavailable; failures are reported with a
# short message instead of being silently discarded.

# Register the input widgets together with their default values.
try:
    dbutils.widgets.text("storage_account", "storagetmufc")
    dbutils.widgets.text("secret_scope", "kv-scope")
    dbutils.widgets.text("key_name", "adls-account-key")
    dbutils.widgets.text("bronze_db", "ufc_bronze")
    dbutils.widgets.text("silver_db", "ufc_silver")
except Exception as exc:
    print(f"[setup] Widgets not registered ({type(exc).__name__}); using defaults where possible.")

# Database names come from the widgets when `dbutils` exists, otherwise from
# the hard-coded defaults.
bronze_db = dbutils.widgets.get("bronze_db") if 'dbutils' in globals() else "ufc_bronze"
silver_db = dbutils.widgets.get("silver_db") if 'dbutils' in globals() else "ufc_silver"

# Fetch the storage account key from the secret scope and hand it to Spark.
try:
    storage_account = dbutils.widgets.get("storage_account")
    secret_scope = dbutils.widgets.get("secret_scope")
    key_name = dbutils.widgets.get("key_name")
    account_key = dbutils.secrets.get(secret_scope, key_name)
    spark.conf.set(f"fs.azure.account.key.{storage_account}.dfs.core.windows.net", account_key)
except Exception as exc:
    # Only the exception type is printed so that no credential material can
    # end up in the notebook output.
    print(f"[setup] Storage account key not configured ({type(exc).__name__}); continuing without it.")

# Select the hive_metastore catalog: SQL first, catalog API as a fallback.
try:
    spark.sql("USE CATALOG hive_metastore")
except Exception:
    try:
        spark.catalog.setCurrentCatalog("hive_metastore")
    except Exception as exc:
        print(f"[setup] Could not select catalog hive_metastore ({type(exc).__name__}); continuing with the current catalog.")

# Make sure both databases exist, then make bronze the current database.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {bronze_db}")
spark.sql(f"CREATE DATABASE IF NOT EXISTS {silver_db}")
try:
    spark.catalog.setCurrentDatabase(bronze_db)
except Exception:
    spark.sql(f"USE DATABASE {bronze_db}")
print("Bronze DB:", bronze_db, "| Silver DB:", silver_db)


In [None]:
# BEFORE preview: load the bronze athletes table, report its row count and
# show the first 20 rows ordered by full_name.
_bronze_athletes_name = f"hive_metastore.{bronze_db}.espn_athletes"
_ath = spark.table(_bronze_athletes_name)
_bronze_row_total = _ath.count()
print("Athletes rows:", _bronze_row_total)
_bronze_sample = _ath.orderBy("full_name").limit(20)
display(_bronze_sample)


In [None]:
# Clean & write silver
#
# Steps:
#   1. Trim the free-text columns and keep only the newest row per athlete_id
#      (ordered by ingestion_date, then run_id, both descending).
#   2. Drop optional columns whose null ratio is at or above SPARSE_THRESHOLD.
#   3. First run: create the silver Delta table.
#      Later runs: MERGE only the rows that are new or newer than silver.
SPARSE_THRESHOLD = 0.98
w = Window.partitionBy("athlete_id").orderBy(F.desc("ingestion_date"), F.desc("run_id"))

trim_cols = ["full_name", "display_name", "country", "stance", "weight_class", "team", "combat_style"]
ath1 = _ath
for col_name in trim_cols:
    ath1 = ath1.withColumn(col_name, F.trim(col_name))
ath1 = ath1.withColumn("rn", F.row_number().over(w)).filter("rn=1").drop("rn")

# A list (not a set) keeps the column order of the written table deterministic.
core_keep = ["athlete_id", "full_name", "display_name", "birth_date"]
optional = ["country", "height", "height_cm", "reach_cm", "stance", "weight_class", "team", "combat_style", "image_url"]
# Optional columns that bronze does not provide are skipped rather than
# selected, which would fail.
present_optional = [c for c in optional if c in ath1.columns]

# One aggregation yields the row count and every per-column null count, so the
# deduplicated frame is evaluated once instead of once per column.
stats = ath1.agg(
    F.count(F.lit(1)).alias("__rows"),
    *[F.count(F.when(F.col(c).isNull(), F.lit(1))).alias(c) for c in present_optional],
).first()
rows_cnt = stats["__rows"]
null_map = {c: (stats[c] / rows_cnt if rows_cnt else 0.0) for c in present_optional}
keep_cols = core_keep + [c for c in present_optional if null_map[c] < SPARSE_THRESHOLD]

out = ath1.select(*keep_cols, "ingestion_date", "run_id")  # include technical columns for incremental

spark.sql(f"CREATE DATABASE IF NOT EXISTS {silver_db}")

table_name = f"hive_metastore.{silver_db}.espn_athletes_silver"
# The table is treated as missing when it cannot be resolved.
try:
    spark.table(table_name)
except Exception:
    exists = False
else:
    exists = True

if not exists:
    out.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(table_name)
else:
    # Pre-filter the source to rows that are absent from silver or strictly
    # newer (by ingestion_date, then run_id) than the row silver holds.
    tgt = spark.table(table_name).select("athlete_id", "ingestion_date", "run_id").alias("t")
    is_new = F.col("t.athlete_id").isNull()
    newer_date = F.col("s.ingestion_date") > F.col("t.ingestion_date")
    same_date_newer_run = (
        (F.col("s.ingestion_date") == F.col("t.ingestion_date"))
        & (F.col("s.run_id") > F.col("t.run_id"))
    )
    incr = (out.alias("s")
        .join(tgt, on="athlete_id", how="left")
        .where(is_new | newer_date | same_date_newer_run)
        .select("s.*")
    )
    incr.createOrReplaceTempView("src_athletes")
    # NOTE(review): the sparse-column filter is recomputed on every run, so
    # `out` may not carry the same columns as the existing silver table.
    # Confirm how `UPDATE SET *` / `INSERT *` behave on such a mismatch for the
    # Delta version in use.
    spark.sql(f"""
        MERGE INTO {table_name} t
        USING src_athletes s
        ON t.athlete_id = s.athlete_id
        WHEN MATCHED AND (t.ingestion_date < s.ingestion_date OR (t.ingestion_date = s.ingestion_date AND t.run_id < s.run_id)) THEN UPDATE SET *
        WHEN NOT MATCHED THEN INSERT *
    """)
print("Athletes silver upserted; kept columns:", out.columns)


In [None]:
# AFTER preview: read the silver table back and show the first 20 rows
# ordered by full_name.
_silver_athletes_name = f"hive_metastore.{silver_db}.espn_athletes_silver"
_display = spark.table(_silver_athletes_name).orderBy("full_name")
_silver_sample = _display.limit(20)
display(_silver_sample)
