In [None]:
# analyze_dataset.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle

# --- Run configuration -------------------------------------------------
CSV = "finalData.csv"   # input table loaded by this script
RANDOM_STATE = 42       # seed passed to both train/val/test split calls
NBINS = 25              # joint action space: 5 fluid bins x 5 vaso bins

# Columns removed before modelling, grouped by the reason they are dropped.
_ID_COLS = ["Unnamed: 0", "hadm_id", "icustay_id", "subject_id"]
_RACE_COLS = ["race_asian", "race_black", "race_latino", "race_white", "race_other"]
DROP_COLS = _ID_COLS + ["HCO3"] + _RACE_COLS  # HCO3: teammate requested

# Load the raw table.
df = pd.read_csv(CSV)

# 90D_Mortality is kept as the label (1=death, 0=alive) and must be present.
# Raise explicitly instead of using `assert`, which is stripped when Python
# runs with -O and would let a label-less table through.
if "90D_Mortality" not in df.columns:
    raise ValueError("90D_Mortality missing")

# Drop the unwanted columns in a single call; errors="ignore" skips any name
# in DROP_COLS that this CSV does not contain.
df = df.drop(columns=DROP_COLS, errors="ignore")

# Every column except the outcome columns is treated as a numeric feature.
feature_cols = [c for c in df.columns if c not in ("90D_Mortality", "Death")]
df[feature_cols] = df[feature_cols].astype(float)

# Simple imputation: replace NaNs with the column mean.
# NOTE(review): the means are computed over the whole table, i.e. before the
# train/val/test split, so held-out rows influence the imputed values.
df[feature_cols] = df[feature_cols].fillna(df[feature_cols].mean())

# Fluid dose level: quintile (0..4) of TotalInput, computed over the entire
# dataset. Ranking with method="first" breaks ties by row order before the
# quantile cut.
n_rows = len(df)
if "TotalInput" not in df.columns:
    # No fluid column available: every row falls back to bin 0.
    fluid_bin = pd.Series(np.zeros(n_rows, dtype=int))
else:
    input_rank = df["TotalInput"].rank(method="first")
    fluid_bin = pd.qcut(input_rank, 5, labels=False)

# The dataset has no vasopressor input, so the historical vaso bin is 0
# for every row.
vaso_bin = pd.Series(np.zeros(n_rows, dtype=int))

# Joint action id on the 5 x 5 grid: a = fluid + 5 * vaso.
Action = (5 * vaso_bin.values + fluid_bin.values).astype(int)

# Label vector and terminal reward proxy on the paper's scale:
# +24 when the patient is alive (label 0), -24 on death.
# NOTE(review): `reward` is computed here but is not among the keys of the
# bundle pickled later in this script -- confirm whether it should be.
y = df["90D_Mortality"].astype(int).values
reward = np.where(y == 0, 24.0, -24.0)

# State features X. Besides the label, the "Death" outcome column (when the
# CSV has one) is removed as well: it is excluded from the feature list used
# for float casting / imputation, and leaving it in X would leak the outcome
# into the model inputs.
X = df.drop(columns=["90D_Mortality", "Death"], errors="ignore")

# Next-state placeholder: the features of the following row. The last row
# has no successor, so the gap is forward-filled.
# `.ffill()` replaces `fillna(method="ffill")`, which is deprecated in
# pandas 2.1+.
Xnext = X.shift(-1).ffill()

# Two-stage stratified split: 70% train, then the remaining 30% is halved
# into validation and test. Both stages stratify on the mortality label and
# use the same seed.
first_cut = train_test_split(
    X, y, Action, Xnext,
    test_size=0.3,
    random_state=RANDOM_STATE,
    stratify=y,
)
(X_train, X_tmp,
 y_train, y_tmp,
 A_train, A_tmp,
 Xnext_train, Xnext_tmp) = first_cut

second_cut = train_test_split(
    X_tmp, y_tmp, A_tmp, Xnext_tmp,
    test_size=0.5,
    random_state=RANDOM_STATE,
    stratify=y_tmp,
)
(X_val, X_test,
 y_val, y_test,
 A_val, A_test,
 Xnext_val, Xnext_test) = second_cut

def _as_scaled_frame(frame, transform):
    """Apply *transform* to *frame* and re-wrap the result with the frame's labels."""
    return pd.DataFrame(transform(frame), columns=frame.columns, index=frame.index)


# Standardize features. The scaler is fitted on the training states only and
# then reused unchanged for every other split, including the next-state frames.
scaler = StandardScaler()
X_train_scaled = _as_scaled_frame(X_train, scaler.fit_transform)
X_val_scaled = _as_scaled_frame(X_val, scaler.transform)
X_test_scaled = _as_scaled_frame(X_test, scaler.transform)

Xnext_train_scaled = _as_scaled_frame(Xnext_train, scaler.transform)
Xnext_val_scaled = _as_scaled_frame(Xnext_val, scaler.transform)
Xnext_test_scaled = _as_scaled_frame(Xnext_test, scaler.transform)

# Collect the action-space size and every split in one dict, group by group
# (scaled states, scaled next states, labels, actions).
D = {"nbins": NBINS}
D.update({
    "X_train": X_train_scaled,
    "X_val": X_val_scaled,
    "X_test": X_test_scaled,
})
D.update({
    "Xnext_train": Xnext_train_scaled,
    "Xnext_val": Xnext_val_scaled,
    "Xnext_test": Xnext_test_scaled,
})
D.update({
    "y_train": y_train,
    "y_val": y_val,
    "y_test": y_test,
})
D.update({
    "Action_train": A_train,
    "Action_val": A_val,
    "Action_test": A_test,
})

# Serialize the bundle to disk.
with open("requiredFile.pkl", "wb") as f:
    pickle.dump(D, f)

print(
    "Saved requiredFile.pkl with 25-action space (vaso_bin=0 historically) and normalized splits."
)
