# UFC Events Transform & Clean (Bronze -> Silver)

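Reads `espn_events` from the bronze database, trims text fields, keeps the latest record per `event_id`, drops sparse optional columns, and writes the result to `espn_events_silver` in the silver database.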


In [None]:
from datetime import datetime, timezone
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Notebook parameters. Widgets are registered when `dbutils` is available;
# outside Databricks the registration fails and is ignored, and the values
# below fall back to None (storage/secret settings) or the default DB names.
_WIDGET_DEFAULTS = {
    "storage_account": "storagetmufc",
    "secret_scope": "kv-scope",
    "key_name": "adls-account-key",
    "bronze_db": "ufc_bronze",
    "silver_db": "ufc_silver",
}

try:
    for _widget_name, _widget_default in _WIDGET_DEFAULTS.items():
        dbutils.widgets.text(_widget_name, _widget_default)
except Exception:
    pass

_has_dbutils = 'dbutils' in globals()

if _has_dbutils:
    storage_account = dbutils.widgets.get("storage_account")
    secret_scope = dbutils.widgets.get("secret_scope")
    key_name = dbutils.widgets.get("key_name")
    bronze_db = dbutils.widgets.get("bronze_db")
    silver_db = dbutils.widgets.get("silver_db")
else:
    # No widgets to read from: only the database names have usable defaults.
    storage_account = None
    secret_scope = None
    key_name = None
    bronze_db = "ufc_bronze"
    silver_db = "ufc_silver"

# Best-effort: fetch the storage account key from the secret scope and register
# it on the Spark session. A failure must not stop the notebook, but it is
# reported so that later storage access errors can be traced back to this step.
# Only the exception type and message are printed, never the key itself.
try:
    account_key = dbutils.secrets.get(secret_scope, key_name)
    spark.conf.set(f"fs.azure.account.key.{storage_account}.dfs.core.windows.net", account_key)
except Exception as exc:
    print(f"WARN: storage account key not configured ({type(exc).__name__}: {exc}); continuing without it")

# Switch to the hive_metastore catalog: try the SQL statement first, then the
# catalog API. If both fail the session keeps its current catalog.
for _switch_catalog in (
    lambda: spark.sql("USE CATALOG hive_metastore"),
    lambda: spark.catalog.setCurrentCatalog("hive_metastore"),
):
    try:
        _switch_catalog()
    except Exception:
        continue
    else:
        break

# Make sure both databases exist (bronze first, then silver).
for _db_name in (bronze_db, silver_db):
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {_db_name}")

# Default to the bronze database, falling back to SQL if the catalog API fails.
try:
    spark.catalog.setCurrentDatabase(bronze_db)
except Exception:
    spark.sql(f"USE DATABASE {bronze_db}")

print("Bronze DB:", bronze_db, "| Silver DB:", silver_db)



In [None]:
# Load the bronze events table, report its row count, and show the number of
# nulls in every column (one aggregate expression per column).
_events = spark.table(f"hive_metastore.{bronze_db}.espn_events")

_bronze_row_total = _events.count()
print("Events rows:", _bronze_row_total)

_null_count_exprs = [
    F.count(F.when(F.col(_col_name).isNull(), _col_name)).alias(_col_name)
    for _col_name in _events.columns
]
display(_events.select(_null_count_exprs))


In [None]:
# Preview BEFORE cleaning (events): the 20 bronze rows with the latest event_date
_bronze_latest_first = _events.orderBy(F.col("event_date").desc())
display(_bronze_latest_first.limit(20))


In [None]:
# Optional columns whose null ratio reaches this threshold are left out of silver.
SPARSE_THRESHOLD = 0.98
# NOTE(review): declared but not used in this cell - rows with a null
# event_id / event_date / event_name are not filtered out here. Confirm intent.
MANDATORY_EVENTS = {"event_id","event_date","event_name"}

# Keep one row per event_id: highest ingestion_date first, ties broken by highest run_id.
w = Window.partitionBy("event_id").orderBy(F.desc("ingestion_date"), F.desc("run_id"))
e1 = (_events
    .withColumn("event_name", F.trim("event_name"))
    .withColumn("venue", F.trim("venue"))
    .withColumn("country", F.trim("country"))
    .withColumn("rn", F.row_number().over(w)).filter("rn=1").drop("rn"))

# Total row count and per-column null counts in a single aggregation, instead
# of one Spark job per column (each of which would re-run the window dedup).
# Results are read by position: index 0 is the row total, then one null count
# per column in e1.columns order.
_profile = e1.agg(
    F.count(F.lit(1)),
    *[F.count(F.when(F.col(c).isNull(), 1)) for c in e1.columns],
).first()
rows_cnt = _profile[0]
# An empty table yields a ratio of 0.0 for every column.
null_stats = [
    (c, (nulls / rows_cnt) if rows_cnt else 0.0)
    for c, nulls in zip(e1.columns, _profile[1:])
]
null_map = dict(null_stats)
print("Null ratios:", {k: round(v,3) for k,v in null_map.items()})

# Optional columns: kept only when present in bronze AND not too sparse.
# A candidate that is absent from the table is skipped rather than selected,
# which would otherwise fail the select below.
candidate_cols = ["venue","country","status","num_fights"]
missing_cols = [c for c in candidate_cols if c not in null_map]
if missing_cols:
    print("Candidate columns not found in bronze, skipped:", missing_cols)
keep_cols = [c for c in candidate_cols if c in null_map and null_map[c] < SPARSE_THRESHOLD]

select_cols = [
    "event_id",
    "event_date",
    F.year("event_date").alias("event_year"),
    "event_name",
    *keep_cols,
]

out = e1.select(*select_cols)

# Full overwrite of the silver table; the schema may change between runs
# because the set of kept optional columns depends on the data.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {silver_db}")
out.write.format("delta").mode("overwrite").option("overwriteSchema","true").saveAsTable(f"hive_metastore.{silver_db}.espn_events_silver")
print("Events silver written; kept columns:", out.columns)



In [None]:
# Preview AFTER cleaning (events): the 20 silver rows with the latest event_date
_silver_events = spark.table(f"hive_metastore.{silver_db}.espn_events_silver")
display(_silver_events.orderBy(F.col("event_date").desc()).limit(20))
