In [None]:
from pathlib import Path
import pandas as pd
from pandas.errors import EmptyDataError

# Build the master injury file (Injury_Overrides.csv) by merging every
# available injury source and keeping, for each (Team, PlayerName) pair,
# the row that comes from the most trusted source.

# This notebook sits in the data folder next to the CSVs, so the current
# working directory is used as the base even if notebook_context set one.
base_dir = Path.cwd()
print("Using base_dir:", base_dir)

# (label, path) pairs, one per injury feed.
# NOTE(review): every label except "ESPN" points at Injury_Overrides.csv,
# which is also the master file written at the end of this cell. Each run
# therefore re-reads the previous master once per label and re-tags its rows
# with that label. Confirm whether per-source file names were intended here.
sources = [
    ("NBA_official_nbainjuries", base_dir / "Injury_Overrides.csv"),
    ("Balldontlie",              base_dir / "Injury_Overrides.csv"),
    ("ESPN",                     base_dir / "Injury_Overrides_live_espn.csv"),
    ("DataDriven",               base_dir / "Injury_Overrides.csv"),
    ("Manual",                   base_dir / "Injury_Overrides.csv"),
]

# Optional columns and the value used when a source does not provide them.
column_defaults = {
    "Status": "",
    "Injury": "",
    "EstReturn": "",
    "Impact": 1.0,
}
# Columns (in order) that every source is reduced to before merging.
output_columns = ["Team", "PlayerName", *column_defaults, "Source"]

frames = []
for label, path in sources:
    # A zero-byte file would make read_csv raise, so it is skipped up front.
    if not path.exists() or path.stat().st_size == 0:
        print(f"⚠️ Skipping {label}: file missing or empty at {path}")
        continue

    try:
        df = pd.read_csv(path)
    except EmptyDataError:
        # Non-empty on disk but without any parseable columns.
        print(f"⚠️ Skipping {label}: EmptyDataError in {path.name}")
        continue

    # "Team" is half of the de-duplication key; a source without it is unusable.
    if "Team" not in df.columns:
        print(f"⚠️ {label}: missing 'Team' column, skipping this source.")
        continue

    # Accept "Player" as an alias for "PlayerName".
    if "PlayerName" not in df.columns:
        if "Player" in df.columns:
            df["PlayerName"] = df["Player"]
        else:
            print(f"⚠️ {label}: missing 'PlayerName'/'Player' column, skipping.")
            continue

    # Fill in optional columns so every frame has the same schema.
    for col, default in column_defaults.items():
        if col not in df.columns:
            df[col] = default

    # Tag with Source label (overrides whatever Source might have been)
    df["Source"] = label

    frames.append(df[output_columns])

if not frames:
    print("❌ No usable injury data found in any source; master file will NOT be updated.")
else:
    all_inj = pd.concat(frames, ignore_index=True)

    # --- Resolve duplicates by priority ---
    # If the same (Team, PlayerName) appears in multiple sources,
    # we keep the row from the highest-priority source.
    # priority_order lists sources from most to least trusted, so rank 1 wins.
    priority_order = ["Manual", "DataDriven", "NBA_official_nbainjuries", "Balldontlie", "ESPN"]
    priority_map = {name: rank for rank, name in enumerate(priority_order, start=1)}

    # Sources that are not listed rank after every listed one.
    lowest_rank = len(priority_order) + 1
    all_inj["priority"] = all_inj["Source"].map(priority_map).fillna(lowest_rank)

    # Ascending sort puts the best (smallest) rank first within each
    # (Team, PlayerName) group, which is the row keep="first" retains.
    all_inj = (
        all_inj.sort_values(["Team", "PlayerName", "priority"])
        .drop_duplicates(subset=["Team", "PlayerName"], keep="first")
        .drop(columns=["priority"])
        .reset_index(drop=True)
    )

    master_path = base_dir / "Injury_Overrides.csv"
    all_inj.to_csv(master_path, index=False)

    print("✅ Master injury file built.")
    print("   Rows in master:", len(all_inj))
    print("   Saved to:", master_path)

    # `display` only exists under IPython/Jupyter; fall back to print so the
    # cell also runs as a plain script.
    try:
        show = display
    except NameError:
        show = print
    show(all_inj.head(20))
