# GOLD: Height/Reach Advantage Impact

Analyzes the correlation between size advantage (height, reach) and win rate. Builds materialized bin tables and optional correlation outputs.


In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Bootstrap: notebook widgets, storage credentials, catalog and databases.
# Every step that depends on `dbutils` or on optional cluster features is
# best-effort: a failure is reported and the notebook carries on with defaults.

# Single source of truth for widget names and their default values.
_WIDGET_DEFAULTS = {
    "storage_account": "storagetmufc",
    "secret_scope": "kv-scope",
    "key_name": "adls-account-key",
    "silver_db": "ufc_silver",
    "gold_db": "ufc_gold",
}

try:
    for _widget_name, _widget_default in _WIDGET_DEFAULTS.items():
        dbutils.widgets.text(_widget_name, _widget_default)
except Exception as exc:
    # Typically `dbutils` is not defined in this session; fall back to defaults below.
    print(f"[bootstrap] widgets not created ({type(exc).__name__}); using defaults where possible")

# Database names come from the widgets when `dbutils` exists, otherwise from the defaults.
_has_dbutils = 'dbutils' in globals()
silver_db = dbutils.widgets.get("silver_db") if _has_dbutils else _WIDGET_DEFAULTS["silver_db"]
gold_db = dbutils.widgets.get("gold_db") if _has_dbutils else _WIDGET_DEFAULTS["gold_db"]

# Register the storage account key with Spark so abfss paths on that account can be read.
try:
    storage_account = dbutils.widgets.get("storage_account")
    secret_scope = dbutils.widgets.get("secret_scope")
    key_name = dbutils.widgets.get("key_name")
    account_key = dbutils.secrets.get(secret_scope, key_name)
    spark.conf.set(f"fs.azure.account.key.{storage_account}.dfs.core.windows.net", account_key)
except Exception as exc:
    # Only the exception type is printed so that no credential material can end up in the output.
    print(f"[bootstrap] storage account key not configured ({type(exc).__name__}); continuing")

# Select the hive_metastore catalog: SQL first, then the catalog API as a fallback.
try:
    spark.sql("USE CATALOG hive_metastore")
except Exception:
    try:
        spark.catalog.setCurrentCatalog("hive_metastore")
    except Exception as exc:
        print(f"[bootstrap] could not switch to catalog hive_metastore ({type(exc).__name__}); continuing")

spark.sql(f"CREATE DATABASE IF NOT EXISTS {silver_db}")
spark.sql(f"CREATE DATABASE IF NOT EXISTS {gold_db}")
print("Silver DB:", silver_db, "| Gold DB:", gold_db)


In [None]:
# Silver-layer inputs: one row per fight, one row per athlete.
fights = spark.table(f"hive_metastore.{silver_db}.espn_fights_silver")
ath = spark.table(f"hive_metastore.{silver_db}.espn_athletes_silver")


def _one_side(fights_df, side, name_col, winner_col):
    """Project the fights table to the fighter in one corner, tagged with `side`."""
    projected = fights_df.select(
        "competition_id", "event_id", "event_date", "event_year", "weight_class",
        F.col(name_col).alias("athlete_name"),
        F.col(winner_col).alias("is_winner"),
    )
    return projected.withColumn("side", F.lit(side))


# Each fight contributes two rows: one for fighter A and one for fighter B.
left = _one_side(fights, "A", "fighter_a_name", "fighter_a_winner")
right = _one_side(fights, "B", "fighter_b_name", "fighter_b_winner")
rows = left.unionByName(right)

# Resolve the fighter name to an athlete_id: a case-insensitive match on either
# the athlete's full name or display name. Unmatched fighters are kept (left join).
names = ath.select(
    F.col("full_name").alias("n1"),
    F.col("display_name").alias("n2"),
    "athlete_id",
)
_fight_name = F.lower("athlete_name")
_name_matches = (_fight_name == F.lower("n1")) | (_fight_name == F.lower("n2"))
rows = rows.join(names, _name_matches, "left")

# Pair every fighter row (x) with the opponent row of the same fight (y) and
# attach athlete attributes to both sides (ax for x, ay for y).
_same_fight_other_side = (
    (F.col("x.competition_id") == F.col("y.competition_id"))
    & (F.col("x.side") != F.col("y.side"))
)
_paired = (
    rows.alias("x")
    .join(ath.alias("ax"), F.col("x.athlete_id") == F.col("ax.athlete_id"), "left")
    .join(rows.alias("y"), _same_fight_other_side, "inner")
    .join(ath.alias("ay"), F.col("y.athlete_id") == F.col("ay.athlete_id"), "left")
)

# Age at the event in years, for the fighter and for the opponent.
_age_x = F.months_between(F.col("x.event_date"), F.col("ax.birth_date")) / 12.0
_age_y = F.months_between(F.col("x.event_date"), F.col("ay.birth_date")) / 12.0

_passthrough = ("competition_id", "event_id", "event_date", "event_year", "weight_class", "is_winner")
_has_any_size_adv = F.col("height_adv_cm").isNotNull() | F.col("reach_adv_cm").isNotNull()

feat = (
    _paired
    .select(
        *[F.col(f"x.{_c}").alias(_c) for _c in _passthrough],
        (F.col("ax.height_cm") - F.col("ay.height_cm")).alias("height_adv_cm"),
        (F.col("ax.reach_cm") - F.col("ay.reach_cm")).alias("reach_adv_cm"),
        (_age_x - _age_y).alias("age_adv_years"),
    )
    # Keep a row as soon as at least one of the two size advantages is known.
    .filter(_has_any_size_adv)
)

# Clip extremes to a symmetric +/- limit per measure; nulls stay null.
for _measure, _limit in (("height_adv_cm", 25), ("reach_adv_cm", 30)):
    _value = F.col(_measure)
    feat = feat.withColumn(
        _measure,
        F.when(_value > _limit, _limit).when(_value < -_limit, -_limit).otherwise(_value),
    )

# Bin edges in centimetres; they span exactly the clipped ranges above.
bins_h = [-25, -20, -15, -10, -5, 0, 5, 10, 15, 20, 25]
bins_r = [-30, -20, -10, 0, 10, 20, 30]

# helper for bin label
def _bin_label(value, edges):
    if value is None:
        return None
    for i in range(len(edges)-1):
        if value >= edges[i] and value < edges[i+1]:
            return f"[{edges[i]},{edges[i+1]})"
    if value == edges[-1]:
        return f"[{edges[-2]},{edges[-1]}]"
    return None

from pyspark.sql.types import StringType
# Spark UDF wrappers around _bin_label, one per edge list.
bin_h_udf = F.udf(lambda v: _bin_label(v, bins_h), StringType())
bin_r_udf = F.udf(lambda v: _bin_label(v, bins_r), StringType())

binned = (feat
    .withColumn("height_adv_bin", bin_h_udf(F.col("height_adv_cm")))
    .withColumn("reach_adv_bin", bin_r_udf(F.col("reach_adv_cm")))
    # Rows whose height advantage could not be binned are excluded from every output below.
    .filter(F.col("height_adv_bin").isNotNull())
)


def _win_rate_by(df, *keys):
    """Group `df` by `keys` and return the columns fights, wins and win_rate per group.

    `fights` counts the rows in the group, `wins` counts the rows where
    `is_winner` is true (null counts as not a win), `win_rate` is wins / fights.
    """
    # Column comparison, not a Python bool test, hence the explicit `== True`.
    won = F.when(F.col("is_winner") == True, 1).otherwise(0)  # noqa: E712
    return (df
        .groupBy(*keys)
        .agg(F.count("*").alias("fights"), F.sum(won).alias("wins"))
        .withColumn("win_rate", F.col("wins") / F.col("fights"))
    )


# Aggregations (no aggregation by weight class, removed per request)
adv_overall = _win_rate_by(binned, "height_adv_bin")
heat_height_reach = _win_rate_by(binned, "height_adv_bin", "reach_adv_bin")

# Materialize as Delta tables, replacing data and schema on every run.
_gold = f"hive_metastore.{gold_db}"
adv_overall.write.format("delta").mode("overwrite").option("overwriteSchema","true").saveAsTable(f"{_gold}.mv_height_advantage_bins_overall")
heat_height_reach.write.format("delta").mode("overwrite").option("overwriteSchema","true").saveAsTable(f"{_gold}.mv_height_vs_reach_heatmap")

# Thin views over the materialized tables.
spark.sql(f"CREATE OR REPLACE VIEW {_gold}.v_height_advantage_bins_overall AS SELECT * FROM {_gold}.mv_height_advantage_bins_overall")
spark.sql(f"CREATE OR REPLACE VIEW {_gold}.v_height_vs_reach_heatmap AS SELECT * FROM {_gold}.mv_height_vs_reach_heatmap")

# Quick preview, ordered by the numeric lower edge of each bin. Sorting the
# label as a string would put "[-10,-5)" before "[-25,-20)" and "[10,15)"
# before "[5,10)"; the label itself is only used as a tie-breaker.
_lower_edge = F.regexp_extract(F.col("height_adv_bin"), r"^\[(-?\d+)", 1).cast("int")
display(spark.table(f"{_gold}.mv_height_advantage_bins_overall").orderBy(_lower_edge, "height_adv_bin"))
