#!/usr/bin/env python3
# prepare_data_my_approach.py
#
# Builds requiredFile_{feature_set}.pkl for 43 or 45 features.
# - Loads finalData.csv and vaso.csv (vasopressor totals)
# - Merges on (icustay_id, bin)
# - Creates vaso_total; builds 5×5 action bins (fluid_bin × vaso_bin -> 0..24)
# - Reward: r = 0.6*(SOFA_t - SOFA_{t+1}); terminal += +/-24 by 90D_Mortality (assumed 1=survive, 0=death)
# - Makes X, Xnext, Action, Reward, Done; splits by patient; z-normalizes w/ train stats

import argparse
import os
import numpy as np
import pandas as pd
from typing import List, Tuple

# Weight applied to the per-step SOFA change (SOFA_t - SOFA_{t+1}) in the
# intermediate reward.
BETA_S = 0.6
# Magnitude of the terminal reward added (+) or subtracted (-) on the last
# row of each ICU stay.
BETA_T = 24.0
# Number of discrete dose levels per drug; the joint action index is
# vaso_bin * N_FLUID + fluid_bin.
N_FLUID = 5
N_VASO  = 5

# Identifier columns that are never used as model features.
DROP_ID_COLS = ["Unnamed: 0", "hadm_id", "icustay_id", "subject_id"]
# One-hot race indicator columns; excluded when the 43-feature set is requested.
RACE_OHE_COLS = ["race_asian","race_black","race_latino","race_white","race_other"]

def load_main(csv_path: str) -> pd.DataFrame:
    """Read the main cohort CSV and coerce the columns used downstream.

    Args:
        csv_path: Location of the CSV file (anything ``pd.read_csv`` accepts).

    Returns:
        The loaded frame. When present, ``bin`` is converted to the nullable
        ``Int64`` dtype and ``90D_Mortality`` to a numeric dtype; values that
        cannot be parsed as numbers become missing.
    """
    frame = pd.read_csv(csv_path)
    columns = frame.columns

    if "bin" in columns:
        numeric_bin = pd.to_numeric(frame["bin"], errors="coerce")
        # Nullable integer dtype keeps unparseable bins as <NA>.
        frame["bin"] = numeric_bin.astype("Int64")

    if "90D_Mortality" in columns:
        frame["90D_Mortality"] = pd.to_numeric(
            frame["90D_Mortality"], errors="coerce"
        )

    return frame

def load_vaso(vaso_path: str) -> pd.DataFrame:
    """Read the vasopressor CSV and compute a total rate per row.

    Args:
        vaso_path: Location of the CSV file (anything ``pd.read_csv`` accepts).

    Returns:
        A frame with exactly the columns ``icustay_id``, ``bin`` (nullable
        ``Int64``) and ``vaso_total``. ``vaso_total`` is the row-wise sum of
        the four rate columns, with missing or unparseable rates counted as 0.
        A rate column that is absent from the file contributes 0.

    Raises:
        KeyError: If the file lacks the ``icustay_id`` or ``bin`` key column.
    """
    rate_cols = [
        "rate_norepinephrine",
        "rate_epinephrine",
        "rate_dopamine",
        "rate_dobutamine",
    ]
    v = pd.read_csv(vaso_path)

    # Fail early with a readable message instead of at the final column
    # selection (the exception type stays KeyError).
    missing_keys = [c for c in ("icustay_id", "bin") if c not in v.columns]
    if missing_keys:
        raise KeyError(
            f"vaso file is missing required key column(s): {missing_keys}"
        )

    for c in rate_cols:
        if c in v.columns:
            # errors="coerce" already maps placeholder strings such as "NULL"
            # to NaN, so no separate replace step is needed.
            v[c] = pd.to_numeric(v[c], errors="coerce")
        else:
            v[c] = 0.0

    v["bin"] = pd.to_numeric(v["bin"], errors="coerce").astype("Int64")
    v["vaso_total"] = v[rate_cols].fillna(0).sum(axis=1)
    return v[["icustay_id", "bin", "vaso_total"]]

def merge_vaso(df: pd.DataFrame, vaso: pd.DataFrame) -> pd.DataFrame:
    """Left-join vasopressor totals onto the main frame.

    Args:
        df: Main frame; must contain ``icustay_id`` and ``bin``.
        vaso: Frame carrying ``icustay_id``, ``bin`` and ``vaso_total``.

    Returns:
        A new frame with every row of ``df`` plus a ``vaso_total`` column;
        rows without a match in ``vaso`` get 0.0.

    Raises:
        ValueError: If ``df`` lacks one of the join keys.

    NOTE(review): a left join emits one output row per matching ``vaso`` row,
    so repeated keys in ``vaso`` would repeat rows of ``df`` -- confirm the
    vaso data is unique per (icustay_id, bin).
    """
    join_keys = ["icustay_id", "bin"]
    if any(key not in df.columns for key in join_keys):
        raise ValueError("finalData must contain icustay_id and bin for merge.")

    merged = pd.merge(df, vaso, how="left", on=join_keys)
    # Unmatched rows mean "no vasopressor recorded" and are treated as zero.
    merged["vaso_total"] = merged["vaso_total"].fillna(0.0)
    return merged

def make_bins_from_dose(series: pd.Series, n_bins: int = 5) -> pd.Series:
    """Discretize doses into ``n_bins`` levels, reserving level 0 for "no dose".

    Values that are missing, unparseable, zero or negative map to level 0.
    Positive values are split into up to ``n_bins - 1`` quantile groups and
    mapped to levels ``1 .. n_bins - 1``. Fewer levels are produced when
    quantile edges coincide.

    Args:
        series: Raw dose values.
        n_bins: Total number of levels including the zero level (default 5,
            i.e. quartiles of the positive doses).

    Returns:
        An integer Series with the same index as ``series``.

    Raises:
        ValueError: If ``n_bins`` is smaller than 1.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be at least 1.")

    doses = pd.to_numeric(series, errors="coerce").fillna(0.0)
    levels = pd.Series(np.zeros(len(doses), dtype=int), index=doses.index)
    positive = doses > 0
    n_levels = n_bins - 1  # levels available for positive doses
    if n_levels == 0 or not positive.any():
        return levels

    nonzero = doses[positive]
    try:
        codes = np.asarray(
            pd.qcut(nonzero, n_levels, labels=False, duplicates="drop"),
            dtype=float,
        )
    except ValueError:
        codes = None

    # With duplicates="drop", qcut can hand back NaN codes instead of raising
    # when every positive dose is identical (all quantile edges collapse).
    # Treat that the same as a failure and fall back to rank-based bins,
    # where ties are split by order of appearance.
    if codes is None or np.isnan(codes).any():
        ranks = nonzero.rank(method="first")
        codes = np.asarray(pd.cut(ranks, n_levels, labels=False), dtype=float)

    levels.loc[positive] = codes.astype(int) + 1  # shift so 0 stays "no dose"
    return levels.astype(int)

def build_action_bins(df: pd.DataFrame) -> pd.DataFrame:
    """Add discrete fluid/vasopressor levels and the joint action index.

    The frame is modified in place: ``fluid_bin`` is derived from
    ``4hourlyOutput``, ``vaso_bin`` from ``vaso_total``, and ``Action`` is
    ``vaso_bin * N_FLUID + fluid_bin``.

    Args:
        df: Frame containing ``4hourlyOutput`` and ``vaso_total``.

    Returns:
        The same frame object, with the three columns added.

    Raises:
        ValueError: If ``4hourlyOutput`` is missing.
    """
    if "4hourlyOutput" not in df.columns:
        raise ValueError("Expected 4hourlyOutput column for fluids proxy.")

    fluid_level = make_bins_from_dose(df["4hourlyOutput"], n_bins=N_FLUID)
    df["fluid_bin"] = fluid_level
    vaso_level = make_bins_from_dose(df["vaso_total"], n_bins=N_VASO)
    df["vaso_bin"] = vaso_level

    # Row-major encoding of the (vaso, fluid) pair into a single integer.
    joint = df["vaso_bin"] * N_FLUID + df["fluid_bin"]
    df["Action"] = joint.astype(int)
    return df

def compute_next_and_done(df: pd.DataFrame) -> pd.DataFrame:
    """Sort rows by stay and time bin, then flag the last row of each stay.

    Args:
        df: Frame containing ``icustay_id`` and ``bin``.

    Returns:
        A sorted copy with a fresh 0..n-1 index and two integer columns:
        ``next_same_patient`` (1 when the following row has the same
        ``icustay_id``) and ``Done`` (its complement).
    """
    ordered = df.sort_values(by=["icustay_id", "bin"])
    ordered = ordered.reset_index(drop=True)

    stay = ordered["icustay_id"]
    # The comparison is False on the very last row (shift yields a missing
    # value there), so that row is marked Done as well.
    continues = stay.eq(stay.shift(-1))

    ordered["next_same_patient"] = continues.astype(int)
    ordered["Done"] = (~continues).astype(int)
    return ordered

def compute_reward(df: pd.DataFrame) -> pd.DataFrame:
    """Attach intermediate, terminal and total reward columns in place.

    ``r_inter`` is ``BETA_S * (SOFA - SOFA_next)``, where ``SOFA_next`` is the
    following row's SOFA within the same ``icustay_id`` (in the frame's
    current row order); rows without a next value get 0. ``r_term`` is
    non-zero only where ``Done == 1``: ``+BETA_T`` when ``90D_Mortality`` is 1
    and ``-BETA_T`` otherwise. ``Reward`` is their sum.

    NOTE(review): the code rewards ``90D_Mortality == 1``, i.e. it treats 1 as
    "survived"; confirm this matches the data encoding. Missing outcomes are
    filled with 0 and therefore receive ``-BETA_T``.

    Args:
        df: Frame with ``icustay_id``, ``SOFA``, ``90D_Mortality`` and ``Done``.

    Returns:
        The same frame object with ``SOFA_next``, ``r_inter``, ``r_term`` and
        ``Reward`` added.

    Raises:
        ValueError: If ``SOFA`` or ``90D_Mortality`` is missing.
    """
    if "SOFA" not in df.columns:
        raise ValueError("SOFA column required for reward.")

    sofa_by_stay = df.groupby("icustay_id")["SOFA"]
    df["SOFA_next"] = sofa_by_stay.shift(-1)
    sofa_drop = df["SOFA"] - df["SOFA_next"]
    df["r_inter"] = (BETA_S * sofa_drop).fillna(0.0)

    if "90D_Mortality" not in df.columns:
        raise ValueError("90D_Mortality required for terminal reward.")

    outcome = pd.to_numeric(df["90D_Mortality"], errors="coerce")
    outcome = outcome.fillna(0).clip(0, 1)
    terminal_value = np.where(outcome == 1, BETA_T, -BETA_T)
    is_terminal = df["Done"] == 1
    df["r_term"] = np.where(is_terminal, terminal_value, 0.0)

    df["Reward"] = df["r_inter"] + df["r_term"]
    return df

def choose_feature_columns(df: pd.DataFrame, feature_set: int) -> List[str]:
    """Select the state-feature columns for the requested feature set.

    Every column of ``df`` is kept unless it is an identifier, the time bin,
    an outcome, or a column derived for actions/rewards/termination. For
    ``feature_set == 43`` the ``HCO3`` column and the race one-hot columns
    are excluded as well.

    Args:
        df: Fully prepared frame (after action, done and reward columns
            have been added).
        feature_set: 43 or 45.

    Returns:
        Column names in the order they appear in ``df``.
    """
    excluded = set(DROP_ID_COLS)
    excluded.update([
        "bin", "SOFA", "SOFA_next", "r_inter", "r_term",
        "Action", "fluid_bin", "vaso_bin", "Reward", "Done",
    ])
    # Pipeline-created helpers must not leak into the state vector:
    # next_same_patient is the complement of Done, and vaso_total is the raw
    # dose from which vaso_bin (and hence Action) is derived.
    excluded.update(["next_same_patient", "vaso_total"])
    excluded.update(["90D_Mortality", "Death"])

    if feature_set == 43:
        excluded.add("HCO3")
        excluded.update(RACE_OHE_COLS)

    return [c for c in df.columns if c not in excluded]

def zscore_fit(X: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Compute per-column mean and standard deviation, ignoring NaNs.

    Args:
        X: Data matrix; statistics are taken along axis 0.

    Returns:
        ``(mean, std)``. A standard deviation of exactly 0 is replaced by 1.0
        so that constant columns can be divided safely.
    """
    column_mean = np.nanmean(X, axis=0)
    column_std = np.nanstd(X, axis=0)
    safe_std = np.where(column_std != 0, column_std, 1.0)
    return column_mean, safe_std

def zscore_apply(X: np.ndarray, mu: np.ndarray, sd: np.ndarray) -> np.ndarray:
    """Standardize ``X`` with the given statistics: ``(X - mu) / sd``."""
    centered = X - mu
    return centered / sd

def split_by_patient(df: pd.DataFrame, test_frac=0.2, seed=42):
    """Split rows into train/test so that no ``icustay_id`` spans both sets.

    Args:
        df: Frame containing ``icustay_id``.
        test_frac: Fraction of distinct ids assigned to the test set.
        seed: Seed for the NumPy random generator that shuffles the ids.

    Returns:
        ``(train, test)`` as independent copies. Rows whose id is not among
        the selected test ids (including rows with a missing id) go to train.
    """
    generator = np.random.default_rng(seed)
    patient_ids = df["icustay_id"].dropna().astype(int).unique()
    generator.shuffle(patient_ids)

    held_out_count = int(round(len(patient_ids) * test_frac))
    held_out = set(patient_ids[:held_out_count])

    in_test = df["icustay_id"].isin(held_out)
    train_rows = df[~in_test].copy()
    test_rows = df[in_test].copy()
    return train_rows, test_rows

def build_mats(df: pd.DataFrame, Xcols: List[str]):
    """Extract state, action, reward, next-state and done arrays from ``df``.

    The next state of a row is the following row of the same ``icustay_id``
    (in the frame's current row order). Where that value is missing -- in
    particular on the last row of a stay -- the row's own value is used, so
    terminal rows self-loop.

    Args:
        df: Frame containing ``icustay_id``, ``Action``, ``Reward``, ``Done``
            and every column named in ``Xcols``.
        Xcols: Feature column names, in output order.

    Returns:
        ``(X, A, R, Xnext, D)`` where ``X`` and ``Xnext`` are float arrays of
        shape ``(len(df), len(Xcols))``, ``A`` and ``D`` are int arrays and
        ``R`` is a float array.
    """
    cols = list(Xcols)
    X = df[cols].to_numpy(dtype=float)
    A = df["Action"].to_numpy(dtype=int)
    R = df["Reward"].to_numpy(dtype=float)
    D = df["Done"].to_numpy(dtype=int)

    if not cols:
        return X, A, R, np.zeros_like(X), D

    # One grouped shift over all feature columns instead of one groupby per
    # column; the result is the same.
    shifted = df.groupby("icustay_id")[cols].shift(-1)
    Xnext = shifted.fillna(df[cols]).to_numpy(dtype=float)  # self-loop at terminal
    return X, A, R, Xnext, D

def main():
    """Command-line entry point: build the matrices and pickle them.

    Reads the main and vasopressor CSVs, derives actions, termination flags
    and rewards, splits rows by patient, z-normalizes the state matrices with
    statistics fitted on the training states, and writes
    ``requiredFile_{feature_set}.pkl`` to the current working directory.
    """
    import pickle

    parser = argparse.ArgumentParser()
    parser.add_argument("--csv", required=True)
    parser.add_argument("--vaso", required=True)
    parser.add_argument("--feature-set", type=int, choices=[43,45], required=True)
    parser.add_argument("--seed", type=int, default=42)
    opts = parser.parse_args()

    # Load, merge and annotate the full table.
    table = merge_vaso(load_main(opts.csv), load_vaso(opts.vaso))
    table = compute_reward(compute_next_and_done(build_action_bins(table)))

    feature_names = choose_feature_columns(table, opts.feature_set)
    train_part, test_part = split_by_patient(table, seed=opts.seed)

    Xtr, Atr, Rtr, Xntr, Dtr = build_mats(train_part, feature_names)
    # Only the test-set state matrix is stored in the payload below.
    Xte, _, _, Xnte, _ = build_mats(test_part, feature_names)

    # Normalization statistics come from the training states only.
    mu, sd = zscore_fit(Xtr)
    Xtr, Xntr, Xte, Xnte = (
        zscore_apply(matrix, mu, sd) for matrix in (Xtr, Xntr, Xte, Xnte)
    )

    survival_test = None
    if "90D_Mortality" in test_part.columns:
        outcome = pd.to_numeric(test_part["90D_Mortality"], errors="coerce")
        survival_test = outcome.fillna(0).clip(0,1).to_numpy(dtype=int)

    payload = {
        # Train
        "Xtrain": Xtr,
        "Actiontrain": Atr,
        "Rewardtrain": Rtr,
        "Xnext_train": Xntr,
        "Done_train": Dtr,
        # Test
        "Xtest": Xte,
        "Survival_test": survival_test,
        # Meta
        "feature_set": opts.feature_set,
        "feature_names": feature_names,
        "norm_mean": mu,
        "norm_std": sd,
        "note": "5x5 actions (fluid, vaso), SOFA-based reward + terminal +/-24 by 90D_Mortality",
    }

    outfile = f"requiredFile_{opts.feature_set}.pkl"
    with open(outfile, "wb") as handle:
        pickle.dump(payload, handle)

    print(f"Saved {outfile} ({opts.feature_set}-feature set, real vaso_total, 5x5 actions, reward).")
    print(f"- Xtrain: {Xtr.shape}, Xnext_train: {Xntr.shape}, Xtest: {Xte.shape}")
    print(f"- Features ({len(feature_names)}): {feature_names[:8]}{'...' if len(feature_names)>8 else ''}")

# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
