# UFC Fights Transform & Clean (Bronze -> Silver)

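Cleans the bronze `espn_fights` table, keeps the latest record per fight (`event_id`, `competition_id`), and writes the result to `espn_fights_silver`.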


In [None]:
from datetime import datetime, timezone
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# --- Notebook parameters and session setup ---------------------------------
# `dbutils` is only present when the notebook runs on Databricks; every use
# of it below is therefore guarded or best-effort.
has_dbutils = "dbutils" in globals()

if has_dbutils:
    try:
        # Widgets let a job or an interactive user override these names.
        dbutils.widgets.text("storage_account", "storagetmufc")
        dbutils.widgets.text("secret_scope", "kv-scope")
        dbutils.widgets.text("key_name", "adls-account-key")
        dbutils.widgets.text("bronze_db", "ufc_bronze")
        dbutils.widgets.text("silver_db", "ufc_silver")
    except Exception as exc:
        # Best-effort, but no longer silent: a failure here usually explains
        # a later error from widgets.get().
        print(f"WARN: could not register widgets: {type(exc).__name__}: {exc}")

if has_dbutils:
    storage_account = dbutils.widgets.get("storage_account")
    secret_scope = dbutils.widgets.get("secret_scope")
    key_name = dbutils.widgets.get("key_name")
    bronze_db = dbutils.widgets.get("bronze_db")
    silver_db = dbutils.widgets.get("silver_db")
else:
    # Without dbutils there is no storage/secret configuration; only the
    # database names fall back to their defaults.
    storage_account = None
    secret_scope = None
    key_name = None
    bronze_db = "ufc_bronze"
    silver_db = "ufc_silver"

try:
    # Register the ADLS account key on the Spark session.
    account_key = dbutils.secrets.get(secret_scope, key_name)
    spark.conf.set(f"fs.azure.account.key.{storage_account}.dfs.core.windows.net", account_key)
except Exception as exc:
    # Still best-effort (the run continues), but the reason is reported so a
    # later storage access failure can be traced back to this step.
    print(f"WARN: ADLS account key not configured: {type(exc).__name__}: {exc}")

# Select the hive_metastore catalog: SQL first, then the catalog API.
try:
    spark.sql("USE CATALOG hive_metastore")
except Exception:
    try:
        spark.catalog.setCurrentCatalog("hive_metastore")
    except Exception as exc:
        print(f"WARN: could not switch to catalog hive_metastore: {type(exc).__name__}: {exc}")

spark.sql(f"CREATE DATABASE IF NOT EXISTS {bronze_db}")
spark.sql(f"CREATE DATABASE IF NOT EXISTS {silver_db}")

# Make the bronze database current; fall back to SQL if the catalog API fails.
try:
    spark.catalog.setCurrentDatabase(bronze_db)
except Exception:
    spark.sql(f"USE DATABASE {bronze_db}")
print("Bronze DB:", bronze_db, "| Silver DB:", silver_db)



In [None]:
# Load bronze fights, keeping only rows that carry an event year.
_fights = spark.table(f"hive_metastore.{bronze_db}.espn_fights").where(F.col("event_year").isNotNull())
print("Fights rows (event_year != null):", _fights.count())

# One-row profile: the number of nulls found in each column.
null_profile = []
for column_name in _fights.columns:
    is_missing = F.col(column_name).isNull()
    null_profile.append(F.count(F.when(is_missing, column_name)).alias(column_name))
display(_fights.select(null_profile))


In [None]:
# Preview BEFORE cleaning (fights)
# First 30 rows, latest event date first and ascending card order within it.
preview_order = [F.col("event_date").desc(), F.col("card_order").asc()]
display(_fights.orderBy(*preview_order).limit(30))


In [None]:
# Clean fights and write silver
# Steps: (1) normalise text/int columns, (2) keep one row per
# (event_id, competition_id), preferring the newest ingestion, (3) leave out
# optional columns that are almost entirely null, (4) create the silver Delta
# table on the first run or MERGE into it on later runs.
SPARSE_THRESHOLD = 0.98  # column is "sparse" when null share >= this value
# Merge keys, the date event_year is derived from, and the versioning metadata
# are required by the select/MERGE below, so they are never treated as sparse.
PROTECTED_COLS = {"event_id", "competition_id", "event_date", "ingestion_date", "run_id"}
OPTIONAL_COLS = [
    "weight_class", "card_order", "status",
    "fighter_a_name", "fighter_a_winner", "fighter_b_name", "fighter_b_winner",
]

# Guard for missing metadata in bronze
cols = set(_fights.columns)
has_ing = "ingestion_date" in cols
has_run = "run_id" in cols

# Ordering used to pick the surviving row per fight: newest metadata first,
# event_date as a fallback, and a constant when nothing else is available.
order_exprs = []
if has_ing:
    order_exprs.append(F.col("ingestion_date").desc())
if has_run:
    order_exprs.append(F.col("run_id").desc())
if not order_exprs and "event_date" in cols:
    order_exprs.append(F.col("event_date").desc())
if not order_exprs:
    order_exprs.append(F.lit(0))

w = Window.partitionBy("event_id", "competition_id").orderBy(*order_exprs)

f1 = (_fights
    .withColumn("weight_class", F.initcap(F.trim("weight_class")))
    .withColumn("card_order", F.col("card_order").cast("int"))
    .withColumn("fighter_a_name", F.trim("fighter_a_name"))
    .withColumn("fighter_b_name", F.trim("fighter_b_name"))
    .withColumn("rn", F.row_number().over(w)).filter("rn=1").drop("rn"))

# drop sparse columns
# The row total and the per-column null counts come from the SAME
# de-duplicated frame and are computed in one aggregation. Dividing f1's null
# counts by the pre-dedup row count would understate every ratio, and one
# count() per column would launch a separate Spark job for each column.
null_exprs = [F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in f1.columns]
stats = f1.select(F.count(F.lit(1)).alias("__rows"), *null_exprs).first()
counts = stats["__rows"]
nulls = [(c, stats[c]) for c in f1.columns]
drop_cols = [
    c for c, n in nulls
    if counts and c not in PROTECTED_COLS and n / counts >= SPARSE_THRESHOLD
]
print("Drop cols:", drop_cols)

# Build the output projection, leaving out sparse optional columns instead of
# selecting a column that has just been dropped (which raises).
# NOTE(review): if the set of sparse columns changes between runs, the source
# schema will differ from the existing silver table and the MERGE `*` clauses
# below may be rejected - confirm whether schema evolution should be enabled.
sparse = set(drop_cols)
sel_cols = ["competition_id", "event_id", "event_date", F.year("event_date").alias("event_year")]
sel_cols += [c for c in OPTIONAL_COLS if c not in sparse]
if has_ing:
    sel_cols.append("ingestion_date")
if has_run:
    sel_cols.append("run_id")

out = f1.select(*sel_cols)

spark.sql(f"CREATE DATABASE IF NOT EXISTS {silver_db}")

table_name = f"hive_metastore.{silver_db}.espn_fights_silver"

# Only "cannot resolve the table" counts as "does not exist". Catching every
# exception here would route an unrelated failure into the overwrite branch
# and replace the whole silver table.
from pyspark.sql.utils import AnalysisException

try:
    spark.table(table_name)
    exists = True
except AnalysisException:
    exists = False

if not exists:
    # First run: create the table from the full cleaned frame.
    out.write.format("delta").mode("overwrite").option("overwriteSchema","true").saveAsTable(table_name)
else:
    if has_ing and has_run:
        # Pre-filter the source to rows that are new or strictly newer than
        # the target, comparing ingestion_date first and run_id on ties.
        tgt = spark.table(table_name).select("event_id", "competition_id", "ingestion_date", "run_id").alias("t")
        incr = (out.alias("s")
            .join(tgt, on=["event_id","competition_id"], how="left")
            .where(F.col("t.event_id").isNull() | (F.col("s.ingestion_date") > F.col("t.ingestion_date")) | ((F.col("s.ingestion_date") == F.col("t.ingestion_date")) & (F.col("s.run_id") > F.col("t.run_id"))))
            .select("s.*")
        )
        incr.createOrReplaceTempView("src_fights")
        spark.sql(f"""
            MERGE INTO {table_name} t
            USING src_fights s
            ON t.event_id = s.event_id AND t.competition_id = s.competition_id
            WHEN MATCHED AND (t.ingestion_date < s.ingestion_date OR (t.ingestion_date = s.ingestion_date AND t.run_id < s.run_id)) THEN UPDATE SET *
            WHEN NOT MATCHED THEN INSERT *
        """)
    else:
        # No metadata available → insert only new keys
        # (also taken when only one of ingestion_date / run_id is present).
        tgt_keys = spark.table(table_name).select("event_id","competition_id").dropDuplicates()
        incr = out.alias("s").join(tgt_keys.alias("t"), on=["event_id","competition_id"], how="left_anti")
        if incr.limit(1).count() > 0:
            incr.createOrReplaceTempView("src_fights")
            spark.sql(f"""
                MERGE INTO {table_name} t
                USING src_fights s
                ON t.event_id = s.event_id AND t.competition_id = s.competition_id
                WHEN NOT MATCHED THEN INSERT *
            """)
print("Fights silver upserted")


In [None]:
# Preview AFTER cleaning (fights)
# First 30 silver rows, latest event date first and ascending card order within it.
silver_fights = spark.table(f"hive_metastore.{silver_db}.espn_fights_silver")
silver_order = [F.col("event_date").desc(), F.col("card_order").asc()]
display(silver_fights.orderBy(*silver_order).limit(30))
