Unnamed: 0,season,team,section,role,player,age,stats_present
0,2026,Pitchers,,Starter,Ko Young-pyo,35.0,True
1,2026,Pitchers,,Starter,So Hyeong-jun,25.0,True
2,2026,Pitchers,,Starter,Oh Won-seok,25.0,True
3,2026,Pitchers,,Starter,Matt Sauer,26.0,True
4,2026,Pitchers,,Starter,Caleb Boushley,32.0,True


Unnamed: 0,team,player,section,role
13,Batters,Starters:,,Closer
28,Pitchers,Kim Mu-sin,,Bullpen
31,Batters,Starters:,,Closer
54,Batters,Starters:,,Closer
77,Batters,Starters:,,Closer
100,Batters,Starters:,,Closer
123,Batters,Starters:,,Closer
145,Batters,Starters:,,Closer
168,Batters,Starters:,,Closer
172,Batters,3B: Han Dong-hee,,Closer


In [2]:
import re
import unicodedata
from typing import Optional

import numpy as np
import pandas as pd

# -----------------------------------------------------------------
# File locations: three inputs and the single output of this cell
# -----------------------------------------------------------------
ROSTER_2026_CSV = "kbo2026_roster_from_txt_modelready.csv"
KBO_BATTING_TEAM_CSV = "KBO_batting_2021_2025_merged.csv"
KBO_PITCHING_TEAM_CSV = "KBO_pitching_2021_2025_merged.csv"
OUTPUT_CSV = "kbo_2026_season_sim_results_from_txt.csv"

# -----------------------------------------------------------------
# League-wide constants
# -----------------------------------------------------------------
GAMES = 144          # games per team; scales per-game rates and win percentages
N_TEAMS = 10
# Every projected/simulated win vector is rescaled to sum to this total.
TOTAL_WINS_LEAGUE = GAMES * N_TEAMS / 2
PYTH_EXP = 1.83      # default exponent of pythag_winpct

# Baselines, also used as fill values when a player has no usable stat.
LEAGUE_OPS = 0.720
LEAGUE_ERA = 4.50
LEAGUE_WHIP = 1.40
LEAGUE_RPA = 0.115   # presumably runs per plate appearance -- confirm

# Blend weights: roster-based estimate vs. the MEMORY_SEASON team record.
ALPHA_RS = 0.65      # weight on roster RS (1 - ALPHA_RS goes to RS_mem)
ALPHA_RA = 0.65      # weight on roster RA (1 - ALPHA_RA goes to RA_mem)
BETA_WINS = 0.85     # weight on model wins (1 - BETA_WINS goes to Wins_mem)
MEMORY_SEASON = 2025

# Weights used to combine ERA with the WHIP-implied ERA into ERA_eff.
W_ERA = 0.65
W_WHIP = 0.35

# Monte Carlo settings consumed by monte_carlo_wins.
N_SIM = 5000         # number of simulated seasons
SIGMA_RS = 0.05      # std-dev of the multiplicative noise on RS_final
SIGMA_RA = 0.07      # std-dev of the multiplicative noise on RA_final
WIN_NOISE = 0.08     # std-dev of the multiplicative noise on wins
RNG_SEED = 42

# =============================
# TEAM NORMALIZATION
# =============================
def clean_text(s: str) -> str:
    """Return *s* as a tidy string.

    ``None`` becomes the empty string; any other value is passed through
    ``str()``. The text is NFKC-normalized, zero-width spaces (U+200B) are
    removed, non-breaking spaces (U+00A0) become regular spaces, and
    leading/trailing whitespace is stripped.
    """
    if s is None:
        return ""
    text = unicodedata.normalize("NFKC", str(s))
    for needle, substitute in (("\u200b", ""), ("\xa0", " ")):
        text = text.replace(needle, substitute)
    return text.strip()

# Canonical team name -> regex fragments that identify it. Patterns are
# searched against the lower-cased, cleaned input, and the first rule (in
# list order) with a matching pattern wins.
TEAM_RULES = [
    ("Kiwoom Heroes",   [r"키움", r"히어로", r"kiwoom", r"heroes"]),
    ("SSG Landers",     [r"ssg", r"랜더", r"landers", r"wyvern", r"sk\s*wyvern"]),
    ("LG Twins",        [r"\blg\b", r"트윈", r"twins"]),
    ("KIA Tigers",      [r"\bkia\b", r"타이거", r"tigers"]),
    ("KT Wiz",          [r"\bkt\b", r"위즈", r"wiz"]),
    ("NC Dinos",        [r"\bnc\b", r"다이노", r"dinos"]),
    ("Doosan Bears",    [r"두산", r"베어", r"doosan", r"bears"]),
    ("Lotte Giants",    [r"롯데", r"자이언", r"lotte", r"giants"]),
    ("Samsung Lions",   [r"삼성", r"라이온", r"samsung", r"lions"]),
    ("Hanwha Eagles",   [r"한화", r"이글", r"hanwha", r"eagles", r"hanhwa"]),
]


def normalize_team(s: str) -> Optional[str]:
    """Map a free-form team label to its canonical name from TEAM_RULES.

    Args:
        s: Raw team label. Any value accepted by ``clean_text`` works,
            including ``None`` and NaN.

    Returns:
        The canonical team name of the first rule that matches, or ``None``
        when the cleaned label is empty, is the literal ``"nan"`` (how a
        missing pandas value stringifies), or matches no rule.
    """
    label = clean_text(s)
    if label == "" or label.lower() == "nan":
        return None
    lowered = label.lower()
    for canon, pats in TEAM_RULES:
        if any(re.search(pat, lowered) for pat in pats):
            return canon
    return None

# Per-league multipliers applied to a player's previous-league OPS and ERA
# (see build_team_projection).
LEAGUE_MULT = {
    "KBO":  {"ops": 1.00, "era": 1.00},
    "MLB":  {"ops": 0.96, "era": 1.18},
    "AAA":  {"ops": 0.92, "era": 1.10},
    "NPB":  {"ops": 0.97, "era": 1.08},
    "CPBL": {"ops": 0.90, "era": 1.22},
    "MiLB": {"ops": 0.91, "era": 1.12},
}

def get_league_mult(prev_league: str):
    """Return the ``{"ops": ..., "era": ...}`` multipliers for a league label.

    The label is cleaned with ``clean_text`` and matched against the keys of
    ``LEAGUE_MULT`` case-insensitively. Unknown, empty, or missing labels get
    the generic fallback ``{"ops": 0.92, "era": 1.12}``.

    Args:
        prev_league: League label such as ``"KBO"``, ``"mlb"`` or ``"MiLB"``.

    Returns:
        The matching entry of ``LEAGUE_MULT`` (the shared dict object, not a
        copy) or a freshly built fallback dict.
    """
    key = clean_text(prev_league).upper()
    # Compare upper-cased on both sides: the table holds the mixed-case key
    # "MiLB", which an exact lookup with the upper-cased input ("MILB")
    # could never hit, silently sending MiLB players to the fallback.
    for league, mult in LEAGUE_MULT.items():
        if league.upper() == key:
            return mult
    return {"ops": 0.92, "era": 1.12}

def load_team_memory_rs_ra(season=MEMORY_SEASON, games=GAMES):
    """Load one season's team-level runs scored / runs allowed "memory".

    Reads the team batting and pitching CSVs, keeps rows of ``season``,
    drops the "League Totals" row, normalizes team names, and scales the
    per-game columns (``R/G`` and ``RA9``) by ``games``.

    Args:
        season: Value of the ``season`` column to keep.
        games: Multiplier applied to the per-game rates.

    Returns:
        DataFrame with columns ``team``, ``RS_mem`` and ``RA_mem``; only
        teams present in both files are kept (inner join).
    """
    def _drop_league_totals(frame):
        # Case-insensitive substring match on the stringified Tm column.
        is_total = frame["Tm"].astype(str).str.contains("League Totals", case=False, na=False)
        return frame.loc[~is_total].copy()

    batting = pd.read_csv(KBO_BATTING_TEAM_CSV)
    pitching = pd.read_csv(KBO_PITCHING_TEAM_CSV)

    batting = batting.loc[batting["season"] == season].copy()
    pitching = pitching.loc[pitching["season"] == season].copy()

    batting = _drop_league_totals(batting)
    pitching = _drop_league_totals(pitching)

    batting["team_norm"] = batting["Tm"].apply(normalize_team)
    pitching["team_norm"] = pitching["Tm"].apply(normalize_team)

    batting["RS_mem"] = games * batting["R/G"].astype(float)
    pitching["RA_mem"] = games * pitching["RA9"].astype(float)

    memory = batting[["team_norm", "RS_mem"]].merge(
        pitching[["team_norm", "RA_mem"]], on="team_norm", how="inner"
    )
    return memory.rename(columns={"team_norm": "team"})

def load_team_memory_wins(season=MEMORY_SEASON):
    """Load one season's team win totals.

    Args:
        season: Value of the ``season`` column to keep.

    Returns:
        DataFrame with columns ``team`` (normalized name) and ``Wins_mem``
        (the ``W`` column of the pitching file), excluding "League Totals".
    """
    # The 'W' column is taken from the pitching file (per the original fix note).
    pitching = pd.read_csv(KBO_PITCHING_TEAM_CSV)

    in_season = pitching["season"] == season
    pitching = pitching.loc[in_season].copy()

    is_total = pitching["Tm"].astype(str).str.contains("League Totals", case=False, na=False)
    pitching = pitching.loc[~is_total].copy()

    pitching["team_norm"] = pitching["Tm"].apply(normalize_team)
    return pitching[["team_norm", "W"]].rename(
        columns={"team_norm": "team", "W": "Wins_mem"}
    )

def pythag_winpct(RS, RA, exp=PYTH_EXP):
    """Pythagorean win percentage: ``RS**exp / (RS**exp + RA**exp)``.

    Args:
        RS: Runs scored; scalar or array-like, converted to a float array.
        RA: Runs allowed; scalar or array-like, converted to a float array.
        exp: Exponent applied to both terms.

    Returns:
        NumPy float result, element-wise over the inputs.
    """
    scored_pow = np.asarray(RS, dtype=float) ** exp
    allowed_pow = np.asarray(RA, dtype=float) ** exp
    return scored_pow / (scored_pow + allowed_pow)

def assign_playing_time(row):
    """Return the playing-time allotment for one roster row.

    Args:
        row: Mapping-like object with ``"section"`` and ``"role"`` entries.

    Returns:
        For rows whose section is ``"Batters"``: 520 when the role is exactly
        one of the nine lineup position codes, otherwise 220. For all other
        rows, matched case-insensitively by substring on the role: 160 for
        "starter", 65 for "closer" or "setup", otherwise 45.
        Units are presumably plate appearances for batters and innings for
        pitchers (the pitcher value is later divided by 9) -- confirm.
    """
    is_batter = row["section"] == "Batters"
    role = str(row["role"])

    if is_batter:
        lineup_positions = {"1B", "2B", "3B", "SS", "LF", "CF", "RF", "C", "DH"}
        if role in lineup_positions:
            return 520
        return 220

    role_lower = role.lower()
    # Checked in order, so a role mentioning "starter" wins over "closer"/"setup".
    pitcher_tiers = (
        (("starter",), 160),
        (("closer", "setup"), 65),
    )
    for keywords, allotment in pitcher_tiers:
        if any(word in role_lower for word in keywords):
            return allotment
    return 45

def build_team_projection(df):
    """Project team runs scored/allowed and expected wins from a player roster.

    Args:
        df: Player-level DataFrame. The columns ``team``, ``section`` and
            ``role`` are read directly. ``prev_league``, ``OPS``, ``ERA``,
            ``WHIP`` and the ``*_adj`` columns are read via ``row.get`` /
            created when absent, so they are optional.

    Returns:
        Tuple ``(teams, df)``:
        ``teams`` has one row per normalized team with ``RS_roster``,
        ``RA_roster``, ``RS_mem``, ``RA_mem``, ``RS_final``, ``RA_final``,
        ``Wins_model``, ``Wins_mem`` and ``Expected_Wins``;
        ``df`` is a copy of the input with the derived player-level columns
        (``team_norm``, ``*_adj``, ``*_final``, ``ERA_eff``, ``PT``,
        ``RS_contrib``, ``RA_contrib``) added.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()
    df["team_norm"] = df["team"].apply(normalize_team)
    
    # Make sure the adjusted-stat columns exist; pre-filled values are kept.
    for col in ["OPS_adj", "ERA_adj", "WHIP_adj"]:
        if col not in df.columns: df[col] = np.nan

    # Fill missing adjusted stats from the raw stats, scaled by the
    # multipliers of the player's previous league.
    for i, row in df.iterrows():
        mult = get_league_mult(row.get("prev_league", ""))
        if row["section"] == "Batters" and pd.isna(row.get("OPS_adj")):
             if pd.notna(row.get("OPS")):
                 df.at[i, "OPS_adj"] = float(row["OPS"]) * mult["ops"]
        if row["section"] == "Pitchers":
            if pd.isna(row.get("ERA_adj")) and pd.notna(row.get("ERA")):
                df.at[i, "ERA_adj"] = float(row["ERA"]) * mult["era"]
            # WHIP is copied as-is; no league multiplier is applied to it.
            if pd.isna(row.get("WHIP_adj")) and pd.notna(row.get("WHIP")):
                df.at[i, "WHIP_adj"] = float(row["WHIP"])

    # Anything still missing (or non-numeric) falls back to the league baseline.
    df["OPS_final"] = pd.to_numeric(df["OPS_adj"], errors='coerce').fillna(LEAGUE_OPS)
    df["ERA_final"] = pd.to_numeric(df["ERA_adj"], errors='coerce').fillna(LEAGUE_ERA)
    df["WHIP_final"] = pd.to_numeric(df["WHIP_adj"], errors='coerce').fillna(LEAGUE_WHIP)

    # Effective ERA for pitchers: weighted blend of ERA and a WHIP-implied ERA
    # (league ERA scaled by the pitcher's WHIP relative to league WHIP).
    ERA_from_WHIP = LEAGUE_ERA * (df["WHIP_final"] / LEAGUE_WHIP)
    df["ERA_eff"] = np.where(df["section"] == "Pitchers", W_ERA * df["ERA_final"] + W_WHIP * ERA_from_WHIP, np.nan)
    df["PT"] = df.apply(assign_playing_time, axis=1)

    # Per-player contributions: batters add runs scored, pitchers add runs
    # allowed (PT times effective ERA per nine); everyone else contributes 0.
    df["RS_contrib"] = np.where(df["section"] == "Batters", df["PT"] * (df["OPS_final"] / LEAGUE_OPS) * LEAGUE_RPA, 0.0)
    df["RA_contrib"] = np.where(df["section"] == "Pitchers", df["PT"] * (df["ERA_eff"].fillna(df["ERA_final"]) / 9.0), 0.0)

    # NOTE(review): groupby drops NaN/None keys by default, so players whose
    # team could not be normalized are presumably excluded here -- confirm.
    team_RS_roster = df.groupby("team_norm")["RS_contrib"].sum()
    team_RA_roster = df.groupby("team_norm")["RA_contrib"].sum()

    teams = pd.DataFrame({
        "team": team_RS_roster.index,
        "RS_roster": team_RS_roster.values,
        "RA_roster": team_RA_roster.reindex(team_RS_roster.index).values
    })

    # Blend the roster-based totals with last season's team totals.
    # NOTE(review): the left merge leaves RS_mem/RA_mem as NaN for a team
    # absent from the memory files; that NaN would flow into the sums used
    # for rescaling below -- verify every roster team has a memory row.
    mem = load_team_memory_rs_ra()
    teams = teams.merge(mem, on="team", how="left")
    teams["RS_final"] = ALPHA_RS * teams["RS_roster"] + (1 - ALPHA_RS) * teams["RS_mem"]
    teams["RA_final"] = ALPHA_RA * teams["RA_roster"] + (1 - ALPHA_RA) * teams["RA_mem"]

    # Pythagorean wins, rescaled so the league total equals TOTAL_WINS_LEAGUE.
    winpct = pythag_winpct(teams["RS_final"], teams["RA_final"])
    wins_model = winpct * GAMES
    wins_model *= (TOTAL_WINS_LEAGUE / wins_model.sum())
    teams["Wins_model"] = wins_model

    # Blend model wins with last season's wins, then rescale once more.
    wins_mem = load_team_memory_wins()
    teams = teams.merge(wins_mem, on="team", how="left")
    teams["Expected_Wins"] = BETA_WINS * teams["Wins_model"] + (1 - BETA_WINS) * teams["Wins_mem"]
    teams["Expected_Wins"] *= (TOTAL_WINS_LEAGUE / teams["Expected_Wins"].sum())

    return teams, df

def monte_carlo_wins(teams):
    """Simulate ``N_SIM`` seasons of team win totals.

    Each simulated season perturbs ``RS_final`` and ``RA_final`` with
    multiplicative normal noise, converts them to Pythagorean wins, blends
    75/25 with ``Expected_Wins``, applies multiplicative win noise, and
    rescales so the wins sum to ``TOTAL_WINS_LEAGUE``.

    Args:
        teams: DataFrame with ``team``, ``RS_final``, ``RA_final`` and
            ``Expected_Wins`` columns.

    Returns:
        DataFrame of shape ``(N_SIM, len(teams))`` with one column per team,
        in the row order of ``teams``. Seeded with ``RNG_SEED``.
    """
    rng = np.random.default_rng(RNG_SEED)
    n_teams = len(teams)
    base_rs = teams["RS_final"].values
    base_ra = teams["RA_final"].values
    anchor_wins = teams["Expected_Wins"].values
    draws = np.zeros((N_SIM, n_teams), dtype=float)

    for k in range(N_SIM):
        # Draw order (RS noise, RA noise, win noise) determines the random
        # stream, so it must stay exactly as is for reproducible output.
        rs_draw = base_rs * rng.normal(1.0, SIGMA_RS, size=n_teams)
        ra_draw = base_ra * rng.normal(1.0, SIGMA_RA, size=n_teams)

        # Floor both at 1 so the Pythagorean formula never sees zero or
        # negative run totals.
        pct = pythag_winpct(np.clip(rs_draw, 1, None), np.clip(ra_draw, 1, None))
        raw_wins = pct * GAMES
        scaled_wins = raw_wins * (TOTAL_WINS_LEAGUE / raw_wins.sum())

        blended = 0.75 * scaled_wins + 0.25 * anchor_wins
        noisy = blended * rng.normal(1.0, WIN_NOISE, size=n_teams)
        draws[k, :] = noisy * (TOTAL_WINS_LEAGUE / noisy.sum())

    return pd.DataFrame(draws, columns=teams["team"].values)

# Pipeline: roster CSV -> team projection -> Monte Carlo win distribution.
df_roster = pd.read_csv(ROSTER_2026_CSV)
teams, df_players = build_team_projection(df_roster)
sim_df = monte_carlo_wins(teams)

# Summary table: projection columns plus the mean and the 10th/90th
# percentiles of the simulated wins. sim_df's columns follow the row order
# of `teams`, so the statistics are attached positionally via `.values`.
out = (
    teams[
        [
            "team",
            "RS_roster",
            "RA_roster",
            "RS_mem",
            "RA_mem",
            "RS_final",
            "RA_final",
            "Wins_mem",
            "Wins_model",
            "Expected_Wins",
        ]
    ]
    .copy()
    .assign(
        Wins_mean=sim_df.mean().values,
        Wins_p10=sim_df.quantile(0.10).values,
        Wins_p90=sim_df.quantile(0.90).values,
    )
    .sort_values("Wins_mean", ascending=False)
    .reset_index(drop=True)
)

print(
    out[
        [
            "team",
            "RS_final",
            "RA_final",
            "Expected_Wins",
            "Wins_mean",
            "Wins_p10",
            "Wins_p90",
        ]
    ]
)
out.to_csv(OUTPUT_CSV, index=False)

            team    RS_final    RA_final  Expected_Wins  Wins_mean   Wins_p10  \
0       LG Twins  567.691151  567.453369      87.676513  87.530043  76.768032   
1  Samsung Lions  624.711942  641.338508      84.191884  85.159383  74.786080   
2    SSG Landers  535.303039  578.162275      80.875366  81.344800  71.421476   
3   Doosan Bears  543.980925  620.349846      75.078439  76.526566  66.857901   
4  Hanwha Eagles  566.004484  656.792955      77.221676  76.049189  66.223718   
5     KIA Tigers  583.201869  702.849252      71.978189  72.709752  63.473424   
6         KT Wiz  529.165494  642.112986      72.426897  72.399837  63.079233   
7       NC Dinos  544.327302  714.418912      67.292703  66.657579  58.051541   
8   Lotte Giants  550.102498  774.249886      62.114002  61.256028  53.161283   
9  Kiwoom Heroes  501.906604  978.702511      41.144329  40.366824  34.259405   

    Wins_p90  
0  97.950218  
1  95.819853  
2  91.874463  
3  86.613108  
4  85.973359  
5  82.432034  
6  