# GOLD: Style Matchup Matrix (Materialized)

Aggregates win rates for each fighter-style vs. opponent-style pairing across all weight classes, materializes the result as a Gold Delta table, and renders it as a heatmap.


In [None]:
from pyspark.sql import functions as F

# Bootstrap: declare widgets, resolve database names, configure storage access
# and make sure the silver/gold databases exist. Every optional step is
# best-effort, but failures are reported instead of being silently swallowed.
try:
    dbutils.widgets.text("storage_account", "storagetmufc")
    dbutils.widgets.text("secret_scope", "kv-scope")
    dbutils.widgets.text("key_name", "adls-account-key")
    dbutils.widgets.text("silver_db", "ufc_silver")
    dbutils.widgets.text("gold_db", "ufc_gold")
except Exception as exc:
    # Deliberately non-fatal: the notebook continues with whatever is available.
    print(f"[bootstrap] could not create widgets ({type(exc).__name__}: {exc})")

# Fall back to the default database names when dbutils is not in the namespace.
silver_db = dbutils.widgets.get("silver_db") if 'dbutils' in globals() else "ufc_silver"
gold_db = dbutils.widgets.get("gold_db") if 'dbutils' in globals() else "ufc_gold"

try:
    storage_account = dbutils.widgets.get("storage_account")
    secret_scope = dbutils.widgets.get("secret_scope")
    key_name = dbutils.widgets.get("key_name")
    account_key = dbutils.secrets.get(secret_scope, key_name)
    spark.conf.set(f"fs.azure.account.key.{storage_account}.dfs.core.windows.net", account_key)
except Exception as exc:
    # Only the exception type is printed so that no secret material can end up
    # in the notebook output.
    print(f"[bootstrap] storage account key not configured ({type(exc).__name__}); "
          "continuing with the session's existing credentials")

try:
    spark.sql("USE CATALOG hive_metastore")
except Exception:
    # If the SQL statement is rejected, try the catalog API before giving up.
    try:
        spark.catalog.setCurrentCatalog("hive_metastore")
    except Exception as exc:
        print(f"[bootstrap] could not switch to catalog hive_metastore ({type(exc).__name__}: {exc})")
spark.sql(f"CREATE DATABASE IF NOT EXISTS {silver_db}")
spark.sql(f"CREATE DATABASE IF NOT EXISTS {gold_db}")
print("Silver DB:", silver_db, "| Gold DB:", gold_db)


In [None]:
# Read the silver-layer inputs: the fights table and the athletes table.
af = spark.read.table(f"hive_metastore.{silver_db}.espn_fights_silver")
ath = spark.read.table(f"hive_metastore.{silver_db}.espn_athletes_silver")

# Normalize combat style into 4 buckets; drop explicit '0'
from pyspark.sql.types import StringType

from typing import Optional

# Separators that may join several disciplines inside one combat-style value.
# They are all rewritten to '/' so the value can be split in a single pass.
_STYLE_SEPARATORS = (',', ';', '&', '|', '\\', ' and ', ' vs ', '/')

# Normalized discipline token -> style bucket. Built once at definition time
# instead of on every UDF invocation. The misspelled keys ('thugjistu',
# 'teakwondo') are kept as-is; presumably they match spellings found in the data.
_STYLE_BUCKETS = {
    # Wrestler
    'grappling': 'wrestler', 'wrestling': 'wrestler', 'thugjistu': 'wrestler',
    'sambo': 'wrestler', 'judo': 'wrestler', 'jiu_jitsu': 'wrestler',
    # Striker
    'sanda': 'striker', 'muay_thai': 'striker', 'kung_fu': 'striker',
    'teakwondo': 'striker', 'taekwondo': 'striker', 'ninja': 'striker',
    'nikidokai': 'striker', 'boxing': 'striker', 'kickboxing': 'striker', 'karate': 'striker',
    # Street fighting
    'street_fighting': 'street_fighting', 'brawler': 'street_fighting',
    # MMA
    'mixed_martial_artist': 'mma', 'mixed_martial_arts': 'mma', 'freestyle': 'mma',
}


def _style_bucket(s: Optional[str]) -> Optional[str]:
    """Map a raw combat-style value to one of four style buckets.

    The value is lower-cased, runs of whitespace are collapsed, and every
    known separator is unified to '/'. Only the first non-empty discipline
    decides the bucket.

    Args:
        s: Raw combat style. ``None`` or a blank value is treated as
            "mixed martial arts". Non-string values are converted with ``str``.

    Returns:
        ``'wrestler'``, ``'striker'``, ``'street_fighting'`` or ``'mma'``
        (the default for unrecognized disciplines), or ``None`` when the
        deciding token is the explicit placeholder ``'0'``.
    """
    # Collapsing whitespace also strips the ends and lets "muay  thai" match.
    text = " ".join(str(s).split()).lower() if s is not None else ""
    if not text:
        text = "mixed martial arts"

    for sep in _STYLE_SEPARATORS:
        text = text.replace(sep, '/')

    # Skip empty pieces so a leading separator ("/ wrestling") does not yield
    # an empty token that would silently fall through to the 'mma' default.
    token = next((part.strip() for part in text.split('/') if part.strip()), '')
    token = token.replace('-', '_').replace(' ', '_')

    # Normalize jiu jitsu variants to a single key.
    if 'brazilian_jiu' in token or 'jiu_jitsu' in token:
        token = 'jiu_jitsu'
    if token == '0':
        return None
    return _STYLE_BUCKETS.get(token, 'mma')

# Expose the Python bucketing function to Spark as a string-returning UDF.
style_bucket_udf = F.udf(_style_bucket, StringType())

# Athlete -> style bucket lookup; athletes whose style buckets to NULL are dropped.
bucketed_athletes = ath.withColumn("style_bucket", style_bucket_udf(F.col("combat_style")))
ath1 = bucketed_athletes.where(F.col("style_bucket").isNotNull()).select("athlete_id", "style_bucket")

# Build two rows per fight (one per corner) with a best-effort name -> id mapping.
left = (af
    .select("competition_id", "event_id", "event_date",
            F.col("fighter_a_name").alias("athlete_name"), F.col("fighter_a_winner").alias("is_winner"))
    .withColumn("side", F.lit("A")))
right = (af
    .select("competition_id", "event_id", "event_date",
            F.col("fighter_b_name").alias("athlete_name"), F.col("fighter_b_winner").alias("is_winner"))
    .withColumn("side", F.lit("B")))

rows = left.unionByName(right)

# Lower-cased name -> athlete_id candidates taken from both name columns.
name_keys = (ath
    .select(F.lower(F.col("full_name")).alias("name_key"), "athlete_id")
    .unionByName(ath.select(F.lower(F.col("display_name")).alias("name_key"), "athlete_id"))
    .filter(F.col("name_key").isNotNull() & F.col("athlete_id").isNotNull()))

# Keep only names that identify exactly one athlete. A name shared by several
# athletes would otherwise duplicate the fighter row in the join below, and the
# later self-join would then count the same bout more than once.
names = (name_keys
    .groupBy("name_key")
    .agg(F.countDistinct("athlete_id").alias("id_count"), F.min("athlete_id").alias("athlete_id"))
    .filter(F.col("id_count") == 1)
    .select("name_key", "athlete_id"))

# Equi-join on the normalized name; unmatched or ambiguous fighters keep a NULL
# athlete_id and therefore a NULL style_bucket after the second left join.
rowsj = (rows
    .withColumn("name_key", F.lower(F.col("athlete_name")))
    .join(names, "name_key", "left")
    .drop("name_key")
    .join(ath1, "athlete_id", "left"))

# Pair each fighter row ("x") with the row for the opposite side of the same
# competition ("y") so that every output row carries both styles.
fighter_rows = rowsj.alias("x")
opponent_rows = rowsj.alias("y")
same_bout_other_side = (
    (F.col("x.competition_id") == F.col("y.competition_id"))
    & (F.col("x.side") != F.col("y.side"))
)
matchup_columns = [
    F.col("x.competition_id").alias("competition_id"),
    F.col("x.event_id").alias("event_id"),
    F.col("x.event_date").alias("event_date"),
    F.col("x.athlete_id").alias("athlete_id"),
    F.col("x.style_bucket").alias("style"),
    F.col("y.style_bucket").alias("opp_style"),
    F.col("x.is_winner").alias("is_winner"),
]
with_op = fighter_rows.join(opponent_rows, same_bout_other_side, "inner").select(*matchup_columns)

# Count fights and wins for every (style, opponent style) pairing.
agg = (with_op
    .groupBy("style", "opp_style")
    .agg(F.count("*").alias("fights"), F.sum(F.when(F.col("is_winner") == True, 1).otherwise(0)).alias("wins"))
    .withColumn("win_rate", F.when(F.col("fights") > 0, F.col("wins") / F.col("fights")).otherwise(F.lit(0.0)))
)

# Filter out zero/null rows and columns. Grouping by count(*) never produces a
# group with zero fights, so the style checks are what actually remove rows:
# fighters without a matched athlete or without a style bucket arrive here with
# a NULL style and would otherwise appear as a NULL row/column in the matrix.
agg_filtered = agg.filter(
    F.col("style").isNotNull()
    & F.col("opp_style").isNotNull()
    & (F.col("fights") > 0)
    & F.col("win_rate").isNotNull()
)

# Materialize the aggregate as a Delta table and expose it through a view.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {gold_db}")
mv_table = f"hive_metastore.{gold_db}.mv_style_matchups_overall"
agg_filtered.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(mv_table)
spark.sql(f"CREATE OR REPLACE VIEW hive_metastore.{gold_db}.v_style_matchups_overall AS SELECT * FROM {mv_table}")

# Heatmap input: pivot the table that was just written rather than
# agg_filtered, so the join/UDF pipeline is not re-executed for every action
# on the pivot and the plot shows exactly what was materialized.
pivot = (spark.table(mv_table)
    .groupBy("style")
    .pivot("opp_style")
    .agg(F.first("win_rate"))
    .orderBy("style"))  # stable row order for display and plotting
display(pivot)


In [None]:
# Matplotlib/Seaborn heatmap
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Collect the (small) pivot to pandas. Missing pairings are left as NaN on
# purpose: filling them with 0 would display a matchup that never happened as
# a genuine 0.00 win rate. seaborn leaves NaN cells blank.
pdf = pivot.toPandas()
if 'style' in pdf.columns:
    pdf = pdf.set_index('style')

if pdf.empty:
    # Nothing to draw; avoid handing an empty frame to the plotting call.
    print("No style matchups available to plot.")
else:
    # Scale the figure with the matrix size, capped at 18 x 14 inches.
    plt.figure(figsize=(min(18, 2 + 1.2*len(pdf.columns)), min(14, 2 + 0.8*len(pdf))))
    sns.heatmap(pdf, cmap="YlOrRd", vmin=0.0, vmax=1.0, annot=True, fmt=".2f")
    plt.title("Win rate (style vs opponent style)")
    plt.xlabel("Opponent style")
    plt.ylabel("Style")
    plt.tight_layout()
    plt.show()
