In [7]:
import pandas as pd
import numpy as np
import re
from difflib import get_close_matches

# =========================
# Load
# =========================
df = pd.read_csv("kbo_2026_prediction_dataset_adjusted.csv")

# =========================
# Basic cleaning of raw team strings
# =========================
df["team_raw"] = (
    df["team"]
    .astype(str)
    .str.replace("\u200b", "", regex=False)
    .str.replace("\xa0", " ", regex=False)
    .str.strip()
)

# =========================
# Canonical KBO team names
# =========================
CANON = [
    "Kiwoom Heroes", "SSG Landers", "LG Twins", "KIA Tigers", "KT Wiz",
    "NC Dinos", "Doosan Bears", "Lotte Giants", "Samsung Lions", "Hanwha Eagles"
]

# Keyword rules (Korean + common English)
# Add more tokens if your raw file contains weird variants.
TEAM_RULES = [
    ("Kiwoom Heroes",   [r"키움", r"히어로", r"kiwoom", r"heroes"]),
    ("SSG Landers",     [r"ssg", r"랜더", r"landers", r"sk\s*wyvern", r"wyvern"]),
    ("LG Twins",        [r"\blg\b", r"트윈", r"twins"]),
    ("KIA Tigers",      [r"\bkia\b", r"타이거", r"tigers"]),
    ("KT Wiz",          [r"\bkt\b", r"위즈", r"wiz"]),
    ("NC Dinos",        [r"\bnc\b", r"다이노", r"dinos"]),
    ("Doosan Bears",    [r"두산", r"베어", r"doosan", r"bears"]),
    ("Lotte Giants",    [r"롯데", r"자이언", r"lotte", r"giants"]),
    ("Samsung Lions",   [r"삼성", r"라이온", r"samsung", r"lions"]),
    ("Hanwha Eagles",   [r"한화", r"이글", r"hanwha", r"eagles"]),
]

def normalize_team(s: str) -> str:
    if s is None or s == "" or s.lower() == "nan":
        return None

    x = s.strip()
    x_low = x.lower()

    # 1) Keyword match (most reliable)
    for canon, patterns in TEAM_RULES:
        for pat in patterns:
            if re.search(pat, x_low):
                return canon

    # 2) If it already looks like a canonical English team, keep it
    close = get_close_matches(x, CANON, n=1, cutoff=0.80)
    if close:
        return close[0]

    # 3) Could not normalize
    return None

df["team_norm"] = df["team_raw"].apply(normalize_team)

# =========================
# Diagnostics: see what failed normalization
# =========================
failed = df[df["team_norm"].isna()]["team_raw"].value_counts()
if len(failed) > 0:
    print("\n[TEAM NORMALIZATION FAILED FOR THESE RAW VALUES]")
    print(failed.head(30))

# If normalization still fails for some values, STOP and add them to TEAM_RULES.
assert df["team_norm"].notna().all(), "Some team names could not be normalized. Add patterns to TEAM_RULES."

# =========================
# ---- Now continue simulator ----
# Make sure RS_contrib / RA_contrib exist BEFORE grouping
# =========================
LEAGUE_OPS = 0.720
LEAGUE_ERA = 4.50
LEAGUE_RPA = 0.115
GAMES = 144
PYTH_EXP = 1.83

df["OPS_final"] = df["OPS_adj"].fillna(LEAGUE_OPS)
df["ERA_final"] = df["ERA_adj"].fillna(LEAGUE_ERA)

def assign_playing_time(row):
    if row["section"] == "Batters":
        return 520 if row["role"] in ["1B","2B","3B","SS","LF","CF","RF","C","DH"] else 220
    else:
        r = str(row["role"]).lower()
        if r == "starter":
            return 160
        if r in ["closer", "setup"]:
            return 65
        return 45

df["PT"] = df.apply(assign_playing_time, axis=1)

df["RS_contrib"] = np.where(
    df["section"] == "Batters",
    df["PT"] * (df["OPS_final"] / LEAGUE_OPS) * LEAGUE_RPA,
    0.0
)

df["RA_contrib"] = np.where(
    df["section"] == "Pitchers",
    df["PT"] * df["ERA_final"] / 9.0,
    0.0
)

team_RS = df.groupby("team_norm")["RS_contrib"].sum()
team_RA = df.groupby("team_norm")["RA_contrib"].sum()

teams = pd.DataFrame({
    "team": team_RS.index,
    "RS": team_RS.values,
    "RA": team_RA.reindex(team_RS.index).values
})

# Ensure exactly 10 teams
assert teams.shape[0] == 10, f"Expected 10 teams, got {teams.shape[0]}"
print("\nTeams in simulation:", teams["team"].tolist())

def pythag_wins(RS, RA):
    return GAMES * (RS**PYTH_EXP / (RS**PYTH_EXP + RA**PYTH_EXP))

teams["Expected_Wins"] = teams.apply(lambda r: pythag_wins(r["RS"], r["RA"]), axis=1)
print("\n[PYTHAGOREAN EXPECTED WINS]")
print(teams.sort_values("Expected_Wins", ascending=False))



[TEAM NORMALIZATION FAILED FOR THESE RAW VALUES]
team_raw
Ե ̾    23
ȭ̱۽    22
Ű      22
λ꺣    22
Ｚ̿     17
Name: count, dtype: int64


AssertionError: Some team names could not be normalized. Add patterns to TEAM_RULES.

In [16]:
# 0) Direct map for garbled encodings (from your diagnostic output)
GARBLED_MAP = {
    "Ű": "Kiwoom Heroes",
    "λ꺣": "Lotte Giants",
    "Ｚ̿": "Samsung Lions",
    "ȭ̱۽": "Doosan Bears",
    "Ե ̾": "Hanwha Eagles",
}

def normalize_team(s: str) -> str:
    df["team_norm"] = df["team_raw"].apply(normalize_team)
    if s is None or s == "" or str(s).lower() == "nan":
        return None

    x = str(s).strip()
    x_low = x.lower()

    # 0) Direct map (most reliable for corrupted strings)
    if x in GARBLED_MAP:
        return GARBLED_MAP[x]

    # 1) Keyword match
    for canon, patterns in TEAM_RULES:
        for pat in patterns:
            if re.search(pat, x_low):
                return canon

    # 2) Fuzzy match to canonical English
    close = get_close_matches(x, CANON, n=1, cutoff=0.80)
    if close:
        return close[0]

    return None

print(sorted([x for x in df["team_norm"].unique() if isinstance(x, str)]))
print(df["team_norm"].value_counts(dropna=False))

# IMPORTANT: re-apply normalization after the function change
df["team_norm"] = df["team_raw"].apply(normalize_team)

# show any raw team strings that still fail
failed = df[df["team_norm"].isna()]["team_raw"].value_counts()
print("[STILL FAILING RAW TEAM STRINGS]")
print(failed)




['KIA Tigers', 'KT Wiz', 'LG Twins', 'NC Dinos', 'SSG Landers']
team_norm
None           106
KT Wiz          22
SSG Landers     22
KIA Tigers      22
NC Dinos        21
LG Twins        21
Name: count, dtype: int64


RecursionError: maximum recursion depth exceeded while calling a Python object

In [17]:
import pandas as pd
import numpy as np
import re
import unicodedata

df = pd.read_csv("kbo_2026_prediction_dataset_adjusted.csv")

# 1) Clean raw team strings
def clean_text(s: str) -> str:
    s = "" if s is None else str(s)
    # Normalize Unicode (fixes full-width chars, etc.)
    s = unicodedata.normalize("NFKC", s)
    # Remove zero-width and non-breaking spaces
    s = s.replace("\u200b", "").replace("\xa0", " ")
    # Strip
    return s.strip()

df["team_raw"] = df["team"].apply(clean_text)

# 2) Canonicalize "keys" by removing ALL whitespace + invisible marks
def canonical_key(s: str) -> str:
    s = clean_text(s)
    # Remove all whitespace
    s = re.sub(r"\s+", "", s)
    # Remove Unicode "format" and "combining mark" characters
    s = "".join(ch for ch in s if unicodedata.category(ch) not in ("Cf", "Mn"))
    return s

# 3) Direct mapping for the 5 garbled values you observed
GARBLED_MAP = {
    "Ű": "Kiwoom Heroes",
    "λ꺣": "Lotte Giants",
    "Ｚ̿": "Samsung Lions",
    "ȭ̱۽": "Doosan Bears",
    "Ե ̾": "Hanwha Eagles",
}

# Canonicalize the garbled keys too (THIS is the key fix)
GARBLED_MAP_CANON = {canonical_key(k): v for k, v in GARBLED_MAP.items()}

# 4) Keyword rules for normal (non-garbled) strings
TEAM_RULES = [
    ("Kiwoom Heroes",   [r"키움", r"히어로", r"kiwoom", r"heroes"]),
    ("SSG Landers",     [r"ssg", r"랜더", r"landers", r"wyvern", r"sk\s*wyvern"]),
    ("LG Twins",        [r"\blg\b", r"트윈", r"twins"]),
    ("KIA Tigers",      [r"\bkia\b", r"타이거", r"tigers"]),
    ("KT Wiz",          [r"\bkt\b", r"위즈", r"wiz"]),
    ("NC Dinos",        [r"\bnc\b", r"다이노", r"dinos"]),
    ("Doosan Bears",    [r"두산", r"베어", r"doosan", r"bears"]),
    ("Lotte Giants",    [r"롯데", r"자이언", r"lotte", r"giants"]),
    ("Samsung Lions",   [r"삼성", r"라이온", r"samsung", r"lions"]),
    ("Hanwha Eagles",   [r"한화", r"이글", r"hanwha", r"eagles"]),
]

CANON_TEAMS = [
    "Kiwoom Heroes", "SSG Landers", "LG Twins", "KIA Tigers", "KT Wiz",
    "NC Dinos", "Doosan Bears", "Lotte Giants", "Samsung Lions", "Hanwha Eagles"
]

def normalize_team(s: str) -> str:
    if s is None:
        return None
    x = clean_text(s)
    if x == "" or x.lower() == "nan":
        return None

    # A) Direct garbled-key mapping (robust)
    k = canonical_key(x)
    if k in GARBLED_MAP_CANON:
        return GARBLED_MAP_CANON[k]

    # B) Keyword mapping
    x_low = x.lower()
    for canon, patterns in TEAM_RULES:
        for pat in patterns:
            if re.search(pat, x_low):
                return canon

    # C) If already canonical english, keep it
    if x in CANON_TEAMS:
        return x

    return None

df["team_norm"] = df["team_raw"].apply(normalize_team)

# Diagnostics
failed = df[df["team_norm"].isna()]["team_raw"].value_counts()
print("\n[FAILED RAW TEAM VALUES]")
print(failed)

print("\n[TEAM NORM COUNTS]")
print(df["team_norm"].value_counts(dropna=False))

# Hard stop if still failing
assert df["team_norm"].notna().all(), "Still have unmapped team strings. See FAILED RAW TEAM VALUES above."
assert df["team_norm"].nunique() == 10, f"Expected 10 teams, got {df['team_norm'].nunique()}"
print("\nOK: 10 teams normalized:", sorted(df["team_norm"].unique()))



[FAILED RAW TEAM VALUES]
Series([], Name: count, dtype: int64)

[TEAM NORM COUNTS]
team_norm
Hanwha Eagles    23
KT Wiz           22
Doosan Bears     22
SSG Landers      22
Kiwoom Heroes    22
Lotte Giants     22
KIA Tigers       22
NC Dinos         21
LG Twins         21
Samsung Lions    17
Name: count, dtype: int64

OK: 10 teams normalized: ['Doosan Bears', 'Hanwha Eagles', 'KIA Tigers', 'KT Wiz', 'Kiwoom Heroes', 'LG Twins', 'Lotte Giants', 'NC Dinos', 'SSG Landers', 'Samsung Lions']


In [19]:
# ============================================================
# KBO 2026 SEASON SIMULATOR (FINAL)
# - Robust team normalization
# - Missing stat regression
# - Team RS/RA aggregation
# - Pythagorean win%
# - Monte Carlo
# - Enforces league zero-sum wins (total = 720)
# ============================================================

import pandas as pd
import numpy as np
import re
import unicodedata

INPUT_CSV  = "kbo_2026_prediction_dataset_adjusted.csv"
OUTPUT_CSV = "kbo_2026_season_simulation_results_final.csv"

# League constants (baseline)
LEAGUE_OPS = 0.720
LEAGUE_ERA = 4.50
LEAGUE_RPA = 0.115
GAMES = 144
N_TEAMS = 10
TOTAL_WINS_LEAGUE = (GAMES * N_TEAMS) / 2  # 720
PYTH_EXP = 1.83

# Monte Carlo
N_SIM = 5000
SIGMA_RS = 0.05
SIGMA_RA = 0.07
RNG_SEED = 42

# -----------------------------
# 1) Load
# -----------------------------
df = pd.read_csv(INPUT_CSV)

# -----------------------------
# 2) Robust team normalization
# -----------------------------
def clean_text(s: str) -> str:
    s = "" if s is None else str(s)
    s = unicodedata.normalize("NFKC", s)
    s = s.replace("\u200b", "").replace("\xa0", " ")
    return s.strip()

def canonical_key(s: str) -> str:
    s = clean_text(s)
    s = re.sub(r"\s+", "", s)
    s = "".join(ch for ch in s if unicodedata.category(ch) not in ("Cf", "Mn"))
    return s

GARBLED_MAP = {
    "Ű": "Kiwoom Heroes",
    "λ꺣": "Lotte Giants",
    "Ｚ̿": "Samsung Lions",
    "ȭ̱۽": "Doosan Bears",
    "Ե ̾": "Hanwha Eagles",
}
GARBLED_MAP_CANON = {canonical_key(k): v for k, v in GARBLED_MAP.items()}

TEAM_RULES = [
    ("Kiwoom Heroes",   [r"키움", r"히어로", r"kiwoom", r"heroes"]),
    ("SSG Landers",     [r"ssg", r"랜더", r"landers", r"wyvern", r"sk\s*wyvern"]),
    ("LG Twins",        [r"\blg\b", r"트윈", r"twins"]),
    ("KIA Tigers",      [r"\bkia\b", r"타이거", r"tigers"]),
    ("KT Wiz",          [r"\bkt\b", r"위즈", r"wiz"]),
    ("NC Dinos",        [r"\bnc\b", r"다이노", r"dinos"]),
    ("Doosan Bears",    [r"두산", r"베어", r"doosan", r"bears"]),
    ("Lotte Giants",    [r"롯데", r"자이언", r"lotte", r"giants"]),
    ("Samsung Lions",   [r"삼성", r"라이온", r"samsung", r"lions"]),
    ("Hanwha Eagles",   [r"한화", r"이글", r"hanwha", r"eagles"]),
]
CANON_TEAMS = [
    "Kiwoom Heroes","SSG Landers","LG Twins","KIA Tigers","KT Wiz",
    "NC Dinos","Doosan Bears","Lotte Giants","Samsung Lions","Hanwha Eagles"
]

def normalize_team(s: str) -> str:
    x = clean_text(s)
    if x == "" or x.lower() == "nan":
        return None

    k = canonical_key(x)
    if k in GARBLED_MAP_CANON:
        return GARBLED_MAP_CANON[k]

    x_low = x.lower()
    for canon, pats in TEAM_RULES:
        for pat in pats:
            if re.search(pat, x_low):
                return canon

    if x in CANON_TEAMS:
        return x

    return None

df["team_raw"] = df["team"].apply(clean_text)
df["team_norm"] = df["team_raw"].apply(normalize_team)

failed = df[df["team_norm"].isna()]["team_raw"].value_counts()
if len(failed) > 0:
    print("\n[TEAM NORMALIZATION FAILED FOR THESE RAW VALUES]")
    print(failed)
assert df["team_norm"].notna().all(), "Unmapped team strings remain."
assert df["team_norm"].nunique() == 10, f"Expected 10 teams, got {df['team_norm'].nunique()}"

# -----------------------------
# 3) Regress missing stats
# -----------------------------
df["OPS_final"] = df["OPS_adj"].fillna(LEAGUE_OPS)
df["ERA_final"] = df["ERA_adj"].fillna(LEAGUE_ERA)

# -----------------------------
# 4) Playing time baseline
# -----------------------------
STARTER_POS = {"1B","2B","3B","SS","LF","CF","RF","C","DH"}

def assign_playing_time(row):
    if row["section"] == "Batters":
        return 520 if str(row["role"]) in STARTER_POS else 220  # PA
    else:
        r = str(row["role"]).lower()
        if r == "starter":
            return 160  # IP
        if r in {"closer","setup"}:
            return 65
        return 45

df["PT"] = df.apply(assign_playing_time, axis=1)

# -----------------------------
# 5) Team RS/RA
# -----------------------------
df["RS_contrib"] = np.where(
    df["section"] == "Batters",
    df["PT"] * (df["OPS_final"] / LEAGUE_OPS) * LEAGUE_RPA,
    0.0
)

df["RA_contrib"] = np.where(
    df["section"] == "Pitchers",
    df["PT"] * df["ERA_final"] / 9.0,
    0.0
)

team_RS = df.groupby("team_norm")["RS_contrib"].sum()
team_RA = df.groupby("team_norm")["RA_contrib"].sum()

teams = pd.DataFrame({
    "team": team_RS.index.to_list(),
    "RS": team_RS.values,
    "RA": team_RA.reindex(team_RS.index).values
})

assert len(teams) == 10

# -----------------------------
# 6) Pythagorean win% -> wins (NOT zero-sum yet)
# -----------------------------
def pythag_winpct(RS, RA, exp=PYTH_EXP):
    RS = np.asarray(RS, dtype=float)
    RA = np.asarray(RA, dtype=float)
    return (RS**exp) / (RS**exp + RA**exp)

teams["WinPct_raw"] = pythag_winpct(teams["RS"], teams["RA"])
teams["Wins_raw"] = teams["WinPct_raw"] * GAMES

# Enforce zero-sum league wins by scaling to 720
scale = TOTAL_WINS_LEAGUE / teams["Wins_raw"].sum()
teams["Expected_Wins"] = teams["Wins_raw"] * scale

# -----------------------------
# 7) Monte Carlo (also zero-sum per simulation)
# -----------------------------
rng = np.random.default_rng(RNG_SEED)
sim_wins = np.zeros((N_SIM, len(teams)), dtype=float)

for i in range(N_SIM):
    RS_sim = teams["RS"].values * rng.normal(1.0, SIGMA_RS, size=len(teams))
    RA_sim = teams["RA"].values * rng.normal(1.0, SIGMA_RA, size=len(teams))
    RS_sim = np.clip(RS_sim, 1e-6, None)
    RA_sim = np.clip(RA_sim, 1e-6, None)

    winpct = pythag_winpct(RS_sim, RA_sim)
    wins = winpct * GAMES

    # enforce total wins = 720 in this simulation
    wins *= (TOTAL_WINS_LEAGUE / wins.sum())
    sim_wins[i, :] = wins

sim_df = pd.DataFrame(sim_wins, columns=teams["team"].values)

summary = pd.DataFrame({
    "team": teams["team"].values,
    "RS": teams["RS"].values,
    "RA": teams["RA"].values,
    "Expected_Wins": teams["Expected_Wins"].values,
    "Wins_mean": sim_df.mean().values,
    "Wins_p10": sim_df.quantile(0.10).values,
    "Wins_p90": sim_df.quantile(0.90).values,
}).sort_values("Wins_mean", ascending=False).reset_index(drop=True)

print("\n[Sanity] Total mean wins:", float(summary["Wins_mean"].sum()), "(target 720)")
print("[Teams]", summary["team"].tolist())

summary.to_csv(OUTPUT_CSV, index=False)
print("\nSaved:", OUTPUT_CSV)
print(summary)



[Sanity] Total mean wins: 719.9999999999998 (target 720)
[Teams] ['Samsung Lions', 'SSG Landers', 'Lotte Giants', 'LG Twins', 'Doosan Bears', 'KIA Tigers', 'NC Dinos', 'KT Wiz', 'Hanwha Eagles', 'Kiwoom Heroes']

Saved: kbo_2026_season_simulation_results_final.csv
            team          RS          RA  Expected_Wins  Wins_mean   Wins_p10  \
0  Samsung Lions  508.181806  465.714444      91.556810  91.389554  83.241070   
1    SSG Landers  538.647222  525.950000      86.650616  86.622100  78.421670   
2   Lotte Giants  531.245694  522.615556      86.070721  85.941174  77.860178   
3       LG Twins  488.299583  494.544444      83.813989  83.904537  75.864208   
4   Doosan Bears  537.494028  621.694444      73.574205  73.590036  65.815352   
5     KIA Tigers  509.945394  636.788889      67.797609  67.677369  60.038017   
6       NC Dinos  482.099167  614.204444      66.310550  66.331204  58.693327   
7         KT Wiz  424.817028  556.407778      64.277679  64.382268  56.646861   
8  Ha

In [23]:
# ============================================================
# KBO 2026 SEASON SIMULATOR + TRADE SCENARIO FUNCTION
# ============================================================
# - Loads kbo_2026_prediction_dataset_adjusted.csv
# - Normalizes teams robustly
# - Builds RS/RA and win distribution (zero-sum 720)
# - Adds: trade_players(p1, p2) -> preview -> confirm -> swap -> rerun -> Δ wins
# ============================================================

import pandas as pd
import numpy as np
import re
import unicodedata

# -----------------------------
# CONFIG
# -----------------------------
INPUT_CSV = "kbo_2026_prediction_dataset_adjusted.csv"

# League constants (baseline)
LEAGUE_OPS = 0.720
LEAGUE_ERA = 4.50
LEAGUE_RPA = 0.115
GAMES = 144
N_TEAMS = 10
TOTAL_WINS_LEAGUE = (GAMES * N_TEAMS) / 2  # 720
PYTH_EXP = 1.83

# Monte Carlo
N_SIM = 3000
SIGMA_RS = 0.05
SIGMA_RA = 0.07
RNG_SEED = 42

# -----------------------------
# TEAM NORMALIZATION (robust)
# -----------------------------
def clean_text(s: str) -> str:
    s = "" if s is None else str(s)
    s = unicodedata.normalize("NFKC", s)
    s = s.replace("\u200b", "").replace("\xa0", " ")
    return s.strip()

def canonical_key(s: str) -> str:
    s = clean_text(s)
    s = re.sub(r"\s+", "", s)
    s = "".join(ch for ch in s if unicodedata.category(ch) not in ("Cf", "Mn"))
    return s

GARBLED_MAP = {
    "Ű": "Kiwoom Heroes",
    "λ꺣": "Lotte Giants",
    "Ｚ̿": "Samsung Lions",
    "ȭ̱۽": "Doosan Bears",
    "Ե ̾": "Hanwha Eagles",
}
GARBLED_MAP_CANON = {canonical_key(k): v for k, v in GARBLED_MAP.items()}

TEAM_RULES = [
    ("Kiwoom Heroes",   [r"키움", r"히어로", r"kiwoom", r"heroes"]),
    ("SSG Landers",     [r"ssg", r"랜더", r"landers", r"wyvern", r"sk\s*wyvern"]),
    ("LG Twins",        [r"\blg\b", r"트윈", r"twins"]),
    ("KIA Tigers",      [r"\bkia\b", r"타이거", r"tigers"]),
    ("KT Wiz",          [r"\bkt\b", r"위즈", r"wiz"]),
    ("NC Dinos",        [r"\bnc\b", r"다이노", r"dinos"]),
    ("Doosan Bears",    [r"두산", r"베어", r"doosan", r"bears"]),
    ("Lotte Giants",    [r"롯데", r"자이언", r"lotte", r"giants"]),
    ("Samsung Lions",   [r"삼성", r"라이온", r"samsung", r"lions"]),
    ("Hanwha Eagles",   [r"한화", r"이글", r"hanwha", r"eagles"]),
]
CANON_TEAMS = [
    "Kiwoom Heroes","SSG Landers","LG Twins","KIA Tigers","KT Wiz",
    "NC Dinos","Doosan Bears","Lotte Giants","Samsung Lions","Hanwha Eagles"
]

def normalize_team(s: str) -> str:
    x = clean_text(s)
    if x == "" or x.lower() == "nan":
        return None

    k = canonical_key(x)
    if k in GARBLED_MAP_CANON:
        return GARBLED_MAP_CANON[k]

    x_low = x.lower()
    for canon, pats in TEAM_RULES:
        for pat in pats:
            if re.search(pat, x_low):
                return canon

    if x in CANON_TEAMS:
        return x

    return None

# -----------------------------
# LOAD + PREP DATA
# -----------------------------
df0 = pd.read_csv(INPUT_CSV)

# Normalize teams
df0["team_raw"] = df0["team"].apply(clean_text)
df0["team_norm"] = df0["team_raw"].apply(normalize_team)

failed = df0[df0["team_norm"].isna()]["team_raw"].value_counts()
if len(failed) > 0:
    print("\n[TEAM NORMALIZATION FAILED FOR THESE RAW VALUES]")
    print(failed)
assert df0["team_norm"].notna().all(), "Unmapped team strings remain."
assert df0["team_norm"].nunique() == 10, f"Expected 10 teams, got {df0['team_norm'].nunique()}"

# Regress missing stats
df0["OPS_final"] = df0.get("OPS_adj", pd.Series([np.nan]*len(df0))).fillna(LEAGUE_OPS)
df0["ERA_final"] = df0.get("ERA_adj", pd.Series([np.nan]*len(df0))).fillna(LEAGUE_ERA)

# Baseline playing time
STARTER_POS = {"1B","2B","3B","SS","LF","CF","RF","C","DH"}
def assign_playing_time(row):
    if row["section"] == "Batters":
        return 520 if str(row["role"]) in STARTER_POS else 220
    r = str(row["role"]).lower()
    if r == "starter":
        return 160
    if r in {"closer","setup"}:
        return 65
    return 45

df0["PT"] = df0.apply(assign_playing_time, axis=1)

# -----------------------------
# CORE SIMULATOR
# -----------------------------
def pythag_winpct(RS, RA, exp=PYTH_EXP):
    RS = np.asarray(RS, dtype=float)
    RA = np.asarray(RA, dtype=float)
    return (RS**exp) / (RS**exp + RA**exp)

def simulate_season(df_in: pd.DataFrame, n_sim=N_SIM, seed=RNG_SEED):
    """
    Returns:
      summary: team-level dataframe with RS, RA, Expected_Wins, Wins_mean, Wins_p10, Wins_p90
    """
    df = df_in.copy()

    # Contributions
    df["RS_contrib"] = np.where(
        df["section"] == "Batters",
        df["PT"] * (df["OPS_final"] / LEAGUE_OPS) * LEAGUE_RPA,
        0.0
    )
    df["RA_contrib"] = np.where(
        df["section"] == "Pitchers",
        df["PT"] * df["ERA_final"] / 9.0,
        0.0
    )

    team_RS = df.groupby("team_norm")["RS_contrib"].sum()
    team_RA = df.groupby("team_norm")["RA_contrib"].sum()

    teams = pd.DataFrame({
        "team": team_RS.index.to_list(),
        "RS": team_RS.values,
        "RA": team_RA.reindex(team_RS.index).values
    })
    assert len(teams) == 10, f"Expected 10 teams, got {len(teams)}"

    # Expected wins (scaled to league total 720)
    winpct = pythag_winpct(teams["RS"], teams["RA"])
    wins_raw = winpct * GAMES
    scale = TOTAL_WINS_LEAGUE / wins_raw.sum()
    teams["Expected_Wins"] = wins_raw * scale

    # Monte Carlo (also scaled to 720 each sim)
    rng = np.random.default_rng(seed)
    sim_wins = np.zeros((n_sim, len(teams)), dtype=float)

    for i in range(n_sim):
        RS_sim = teams["RS"].values * rng.normal(1.0, SIGMA_RS, size=len(teams))
        RA_sim = teams["RA"].values * rng.normal(1.0, SIGMA_RA, size=len(teams))
        RS_sim = np.clip(RS_sim, 1e-6, None)
        RA_sim = np.clip(RA_sim, 1e-6, None)

        winpct_i = pythag_winpct(RS_sim, RA_sim)
        wins_i = winpct_i * GAMES
        wins_i *= (TOTAL_WINS_LEAGUE / wins_i.sum())
        sim_wins[i, :] = wins_i

    sim_df = pd.DataFrame(sim_wins, columns=teams["team"].values)

    summary = pd.DataFrame({
        "team": teams["team"].values,
        "RS": teams["RS"].values,
        "RA": teams["RA"].values,
        "Expected_Wins": teams["Expected_Wins"].values,
        "Wins_mean": sim_df.mean().values,
        "Wins_p10": sim_df.quantile(0.10).values,
        "Wins_p90": sim_df.quantile(0.90).values,
    }).sort_values("Wins_mean", ascending=False).reset_index(drop=True)

    return summary

# Baseline results (optional)
baseline_summary = simulate_season(df0)

# -----------------------------
# TRADE FUNCTION
# -----------------------------
def _find_player_rows(df: pd.DataFrame, player_name: str):
    """
    Tries to match player by exact name, then case-insensitive contains.
    Returns a dataframe of matching rows.
    """
    name_col = "player" if "player" in df.columns else ("Name" if "Name" in df.columns else None)
    if name_col is None:
        raise ValueError("No player name column found. Expected 'player' or 'Name'.")

    # exact match first
    exact = df[df[name_col].astype(str) == str(player_name)]
    if len(exact) > 0:
        return exact, name_col

    # contains match
    pat = re.escape(str(player_name).strip())
    contains = df[df[name_col].astype(str).str.contains(pat, case=False, na=False)]
    return contains, name_col

def trade_players(df_base: pd.DataFrame,
                  player1: str,
                  player2: str,
                  confirm: bool = True,
                  n_sim: int = N_SIM,
                  seed: int = RNG_SEED):
    """
    Interactive trade:
      1) Shows matched rows for both players (for user verification)
      2) If confirmed, swaps team_norm between the two selected rows (first match each)
      3) Reruns season sim and returns Δ wins vs baseline

    Notes:
      - If multiple rows match a name, it uses the FIRST match; you can refine the query.
      - Swap is only on team_norm (team assignment), not stats.
    """
    base_summary = simulate_season(df_base, n_sim=n_sim, seed=seed)

    m1, name_col = _find_player_rows(df_base, player1)
    m2, _ = _find_player_rows(df_base, player2)

    if len(m1) == 0 or len(m2) == 0:
        raise ValueError(
            f"Player match failed. Matches: {player1}={len(m1)} rows, {player2}={len(m2)} rows.\n"
            f"Try more specific names."
        )

    # Preview key columns
    preview_cols = [c for c in [
        name_col, "section", "role", "team_norm", "prev_league",
        "OPS_final", "ERA_final", "PT"
    ] if c in df_base.columns]

    print("\n[PLAYER 1 MATCHES]")
    print(m1[preview_cols].head(10).to_string(index=True))
    print("\n[PLAYER 2 MATCHES]")
    print(m2[preview_cols].head(10).to_string(index=True))

    # Choose first match per player
    idx1 = m1.index[0]
    idx2 = m2.index[0]

    if confirm:
        ans = input("\nProceed with trade by swapping these two players' teams? (y/n): ").strip().lower()
        if ans not in ("y", "yes"):
            print("Trade cancelled.")
            return base_summary, None, None

    # Apply swap
    df_new = df_base.copy()
    t1 = df_new.loc[idx1, "team_norm"]
    t2 = df_new.loc[idx2, "team_norm"]
    df_new.loc[idx1, "team_norm"] = t2
    df_new.loc[idx2, "team_norm"] = t1

    new_summary = simulate_season(df_new, n_sim=n_sim, seed=seed)

    # Δ wins table
    delta = new_summary.merge(
        base_summary[["team", "Wins_mean"]],
        on="team",
        how="left",
        suffixes=("_new", "_base")
    )
    delta["Delta_Wins_mean"] = delta["Wins_mean_new"] - delta["Wins_mean_base"]
    delta = delta.sort_values("Delta_Wins_mean", ascending=False).reset_index(drop=True)

    print("\n[Δ WINS (mean) AFTER TRADE]")
    print(delta[["team", "Wins_mean_base", "Wins_mean_new", "Delta_Wins_mean"]].to_string(index=False))

    return base_summary, new_summary, delta

# -----------------------------
# USAGE EXAMPLE
# -----------------------------
# 1) See baseline:
# print(baseline_summary)
#
# 2) Run a trade:
# base, new, delta = trade_players(df0, "Joo Hwan Choi", "Some Other Player", confirm=True)


In [None]:
print(baseline_summary)
base, new, delta = trade_players(df0, "Joo Hwan Choi", "Son Ah-seop", confirm=True)


            team          RS          RA  Expected_Wins  Wins_mean   Wins_p10  \
0  Samsung Lions  508.181806  465.714444      91.556810  91.413832  83.369079   
1    SSG Landers  538.647222  525.950000      86.650616  86.689896  78.589792   
2   Lotte Giants  531.245694  522.615556      86.070721  85.991454  78.002984   
3       LG Twins  488.299583  494.544444      83.813989  83.932696  75.956077   
4   Doosan Bears  537.494028  621.694444      73.574205  73.666017  66.030233   
5     KIA Tigers  509.945394  636.788889      67.797609  67.635696  59.988322   
6       NC Dinos  482.099167  614.204444      66.310550  66.222700  58.493276   
7         KT Wiz  424.817028  556.407778      64.277679  64.310113  56.584571   
8  Hanwha Eagles  519.426250  694.538889      62.774089  62.839820  55.180866   
9  Kiwoom Heroes  482.429217  965.894444      37.173733  37.297776  31.736340   

    Wins_p90  
0  99.517119  
1  95.153840  
2  94.294880  
3  92.088974  
4  81.627762  
5  75.379043  
6  

In [27]:
import pandas as pd
import numpy as np
import re
import unicodedata

# =============================
# FILES
# =============================
ROSTER_INPUT_CSV = "kbo_2026_prediction_dataset_adjusted.csv"
KBO_BATTING_TEAM_CSV = "KBO_batting_2021_2025_merged.csv"
KBO_PITCHING_TEAM_CSV = "KBO_pitching_2021_2025_merged.csv"
OUTPUT_CSV = "kbo_2026_season_simulation_results_final.csv"

# =============================
# LEAGUE CONSTANTS
# =============================
LEAGUE_OPS = 0.720
LEAGUE_ERA = 4.50
LEAGUE_RPA = 0.115
GAMES = 144
N_TEAMS = 10
TOTAL_WINS_LEAGUE = (GAMES * N_TEAMS) / 2  # 720
PYTH_EXP = 1.83

# Monte Carlo
N_SIM = 5000
SIGMA_RS = 0.05
SIGMA_RA = 0.07
RNG_SEED = 42

# Team-memory regression weights
# alpha=1.0 -> pure roster model
# alpha=0.0 -> pure last-season team results
ALPHA_RS = 0.65
ALPHA_RA = 0.65
MEMORY_SEASON = 2025

# =============================
# TEAM NORMALIZATION (robust)
# =============================
def clean_text(s: str) -> str:
    s = "" if s is None else str(s)
    s = unicodedata.normalize("NFKC", s)
    s = s.replace("\u200b", "").replace("\xa0", " ")
    return s.strip()

def canonical_key(s: str) -> str:
    s = clean_text(s)
    s = re.sub(r"\s+", "", s)
    s = "".join(ch for ch in s if unicodedata.category(ch) not in ("Cf", "Mn"))
    return s

# Garbled keys seen in your data
GARBLED_MAP = {
    "Ű": "Kiwoom Heroes",
    "λ꺣": "Lotte Giants",
    "Ｚ̿": "Samsung Lions",
    "ȭ̱۽": "Doosan Bears",
    "Ե ̾": "Hanwha Eagles",
}
GARBLED_MAP_CANON = {canonical_key(k): v for k, v in GARBLED_MAP.items()}

# Keyword rules (works for: Korean, English, abbreviations)
TEAM_RULES = [
    ("Kiwoom Heroes",   [r"키움", r"히어로", r"kiwoom", r"heroes"]),
    ("SSG Landers",     [r"ssg", r"랜더", r"landers", r"wyvern", r"sk\s*wyvern", r"skwyverns"]),
    ("LG Twins",        [r"\blg\b", r"트윈", r"twins"]),
    ("KIA Tigers",      [r"\bkia\b", r"타이거", r"tigers"]),
    ("KT Wiz",          [r"\bkt\b", r"위즈", r"wiz"]),
    ("NC Dinos",        [r"\bnc\b", r"다이노", r"dinos"]),
    ("Doosan Bears",    [r"두산", r"베어", r"doosan", r"bears"]),
    ("Lotte Giants",    [r"롯데", r"자이언", r"lotte", r"giants"]),
    ("Samsung Lions",   [r"삼성", r"라이온", r"samsung", r"lions"]),
    ("Hanwha Eagles",   [r"한화", r"이글", r"hanwha", r"eagles"]),
]
CANON_TEAMS = [
    "Kiwoom Heroes","SSG Landers","LG Twins","KIA Tigers","KT Wiz",
    "NC Dinos","Doosan Bears","Lotte Giants","Samsung Lions","Hanwha Eagles"
]

def normalize_team(s: str) -> str:
    x = clean_text(s)
    if x == "" or x.lower() == "nan":
        return None

    k = canonical_key(x)
    if k in GARBLED_MAP_CANON:
        return GARBLED_MAP_CANON[k]

    x_low = x.lower()
    for canon, pats in TEAM_RULES:
        for pat in pats:
            if re.search(pat, x_low):
                return canon

    if x in CANON_TEAMS:
        return x

    return None

# =============================
# TEAM-MEMORY: get RS/RA from 2025 team stats
# =============================
def load_team_memory_rs_ra(season=MEMORY_SEASON, games=GAMES):
    bat = pd.read_csv(KBO_BATTING_TEAM_CSV)
    pit = pd.read_csv(KBO_PITCHING_TEAM_CSV)

    for d in (bat, pit):
        if "season" not in d.columns:
            raise ValueError("Expected a 'season' column in team-level KBO files.")
        if "Tm" not in d.columns:
            raise ValueError("Expected a 'Tm' column in team-level KBO files (team name).")

    if "R/G" not in bat.columns:
        raise ValueError("Expected 'R/G' in KBO_batting_2021_2025_merged.csv")
    if "RA9" not in pit.columns:
        raise ValueError("Expected 'RA9' in KBO_pitching_2021_2025_merged.csv")

    bat = bat[bat["season"] == season].copy()
    pit = pit[pit["season"] == season].copy()

    # remove league totals / aggregate rows
    bat = bat[~bat["Tm"].astype(str).str.contains("League Totals", case=False, na=False)].copy()
    pit = pit[~pit["Tm"].astype(str).str.contains("League Totals", case=False, na=False)].copy()

    bat["team_norm"] = bat["Tm"].apply(normalize_team)
    pit["team_norm"] = pit["Tm"].apply(normalize_team)

    bat_fail = bat[bat["team_norm"].isna()]["Tm"].value_counts()
    pit_fail = pit[pit["team_norm"].isna()]["Tm"].value_counts()
    if len(bat_fail) > 0 or len(pit_fail) > 0:
        print("\n[TEAM MEMORY NORMALIZATION FAILURES]")
        if len(bat_fail) > 0:
            print("Batting team labels failing normalization:")
            print(bat_fail)
        if len(pit_fail) > 0:
            print("Pitching team labels failing normalization:")
            print(pit_fail)
        raise ValueError("Fix TEAM_RULES to cover the team labels above.")

    bat["RS_mem"] = bat["R/G"].astype(float) * games
    pit["RA_mem"] = pit["RA9"].astype(float) * games

    mem = pd.merge(
        bat[["team_norm","RS_mem"]],
        pit[["team_norm","RA_mem"]],
        on="team_norm",
        how="inner"
    )

    assert mem["team_norm"].nunique() == 10, f"Expected 10 teams in memory, got {mem['team_norm'].nunique()}"
    return mem


# =============================
# CORE SIMULATOR
# =============================
def pythag_winpct(RS, RA, exp=PYTH_EXP):
    RS = np.asarray(RS, dtype=float)
    RA = np.asarray(RA, dtype=float)
    return (RS**exp) / (RS**exp + RA**exp)

def assign_playing_time(row):
    STARTER_POS = {"1B","2B","3B","SS","LF","CF","RF","C","DH"}
    if row["section"] == "Batters":
        return 520 if str(row["role"]) in STARTER_POS else 220
    r = str(row["role"]).lower()
    if r == "starter":
        return 160
    if r in {"closer","setup"}:
        return 65
    return 45

def simulate_season_with_team_memory(df_roster: pd.DataFrame,
                                    alpha_rs=ALPHA_RS,
                                    alpha_ra=ALPHA_RA,
                                    n_sim=N_SIM,
                                    seed=RNG_SEED):
    df = df_roster.copy()

    # regress missing player stats
    df["OPS_final"] = df["OPS_adj"].fillna(LEAGUE_OPS)
    df["ERA_final"] = df["ERA_adj"].fillna(LEAGUE_ERA)

    # playing time
    df["PT"] = df.apply(assign_playing_time, axis=1)

    # roster RS/RA contributions
    df["RS_contrib"] = np.where(
        df["section"] == "Batters",
        df["PT"] * (df["OPS_final"] / LEAGUE_OPS) * LEAGUE_RPA,
        0.0
    )
    df["RA_contrib"] = np.where(
        df["section"] == "Pitchers",
        df["PT"] * df["ERA_final"] / 9.0,
        0.0
    )

    team_RS_roster = df.groupby("team_norm")["RS_contrib"].sum()
    team_RA_roster = df.groupby("team_norm")["RA_contrib"].sum()

    teams = pd.DataFrame({
        "team": team_RS_roster.index.to_list(),
        "RS_roster": team_RS_roster.values,
        "RA_roster": team_RA_roster.reindex(team_RS_roster.index).values
    })
    assert len(teams) == 10, f"Expected 10 teams, got {len(teams)}"

    # Load memory RS/RA from 2025 team stats and merge
    mem = load_team_memory_rs_ra(season=MEMORY_SEASON, games=GAMES).rename(columns={"team_norm":"team"})
    teams = teams.merge(mem, on="team", how="left")
    assert teams["RS_mem"].notna().all() and teams["RA_mem"].notna().all(), "Missing memory RS/RA for some teams."

    # Team-memory regression blend
    teams["RS_final"] = alpha_rs * teams["RS_roster"] + (1 - alpha_rs) * teams["RS_mem"]
    teams["RA_final"] = alpha_ra * teams["RA_roster"] + (1 - alpha_ra) * teams["RA_mem"]

    # Expected wins (scaled to 720)
    winpct = pythag_winpct(teams["RS_final"], teams["RA_final"])
    wins_raw = winpct * GAMES
    teams["Expected_Wins"] = wins_raw * (TOTAL_WINS_LEAGUE / wins_raw.sum())

    # Monte Carlo (scaled to 720 each sim)
    rng = np.random.default_rng(seed)
    sim_wins = np.zeros((n_sim, len(teams)), dtype=float)

    for i in range(n_sim):
        RS_sim = teams["RS_final"].values * rng.normal(1.0, SIGMA_RS, size=len(teams))
        RA_sim = teams["RA_final"].values * rng.normal(1.0, SIGMA_RA, size=len(teams))
        RS_sim = np.clip(RS_sim, 1e-6, None)
        RA_sim = np.clip(RA_sim, 1e-6, None)

        winpct_i = pythag_winpct(RS_sim, RA_sim)
        wins_i = winpct_i * GAMES
        wins_i *= (TOTAL_WINS_LEAGUE / wins_i.sum())
        sim_wins[i, :] = wins_i

    sim_df = pd.DataFrame(sim_wins, columns=teams["team"].values)

    summary = pd.DataFrame({
        "team": teams["team"].values,
        "RS_roster": teams["RS_roster"].values,
        "RA_roster": teams["RA_roster"].values,
        "RS_mem_2025": teams["RS_mem"].values,
        "RA_mem_2025": teams["RA_mem"].values,
        "RS_final": teams["RS_final"].values,
        "RA_final": teams["RA_final"].values,
        "Expected_Wins": teams["Expected_Wins"].values,
        "Wins_mean": sim_df.mean().values,
        "Wins_p10": sim_df.quantile(0.10).values,
        "Wins_p90": sim_df.quantile(0.90).values,
    }).sort_values("Wins_mean", ascending=False).reset_index(drop=True)

    return summary

# =============================
# RUN
# =============================
roster = pd.read_csv(ROSTER_INPUT_CSV)
roster["team_norm"] = roster["team"].apply(normalize_team)
assert roster["team_norm"].notna().all(), "Roster team normalization failed (update TEAM_RULES/GARBLED_MAP)."
assert roster["team_norm"].nunique() == 10, f"Expected 10 teams in roster, got {roster['team_norm'].nunique()}"

summary = simulate_season_with_team_memory(roster, alpha_rs=ALPHA_RS, alpha_ra=ALPHA_RA)

print("\n[Sanity] Total mean wins:", float(summary["Wins_mean"].sum()), "(target 720)")
print(summary[["team","RS_final","RA_final","Wins_mean","Wins_p10","Wins_p90"]])

summary.to_csv(OUTPUT_CSV, index=False)
print("\nSaved:", OUTPUT_CSV)



[Sanity] Total mean wins: 719.9999999999999 (target 720)
            team    RS_final    RA_final  Wins_mean   Wins_p10   Wins_p90
0  Samsung Lions  601.470174  534.050389  88.149566  80.519848  95.853631
1       LG Twins  593.082729  528.597889  88.058183  80.555095  95.757054
2    SSG Landers  563.312694  545.483500  82.004178  74.268436  89.753665
3   Lotte Giants  581.685701  605.308111  76.703503  69.144542  84.418238
4  Hanwha Eagles  578.539062  645.994278  71.599710  63.982231  79.310091
5   Doosan Bears  575.667118  647.029389  71.201295  63.834940  78.849338
6       NC Dinos  569.396458  672.904889  67.584935  60.229762  75.231775
7         KT Wiz  502.931068  595.017056  67.572337  60.019971  75.250954
8     KIA Tigers  565.320506  674.984778  66.737088  59.435914  74.121541
9  Kiwoom Heroes  516.690991  935.271389  40.389207  34.639856  46.297148

Saved: kbo_2026_season_simulation_results_final.csv
