In [1]:
# analyze_dataset.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle

# ---- configuration ----
CSV = "finalData.csv"      # input table, read with pd.read_csv
RANDOM_STATE = 42          # seed handed to both train_test_split calls
NBINS = 25                 # size of the action space: 5 fluid bins x 5 vaso bins

# Columns removed from the table right after loading (only those that exist).
DROP_COLS = [
    # unnamed CSV index column and the *_id columns
    "Unnamed: 0",
    "hadm_id",
    "icustay_id",
    "subject_id",
    "HCO3",
    # race_* columns
    "race_asian",
    "race_black",
    "race_latino",
    "race_white",
    "race_other",
]

# read the raw table
df = pd.read_csv(CSV)

# 90D_Mortality is the label (1=death, 0=alive); stop immediately if it is absent
assert "90D_Mortality" in df.columns, "90D_Mortality missing"

# remove whichever of the unwanted columns the file actually contains
df = df.drop(columns=[col for col in DROP_COLS if col in df.columns])

# Every column other than 90D_Mortality and Death is treated as a feature:
# cast to float, then replace NaNs with that column's mean.
# NOTE(review): the means are computed over all rows, i.e. before the
# train/val/test split below, so they include validation and test rows.
feature_cols = [col for col in df.columns if col not in ("90D_Mortality", "Death")]
df[feature_cols] = (
    df[feature_cols]
    .astype(float)
    .pipe(lambda frame: frame.fillna(frame.mean()))
)

# Fluid component of the action: rank TotalInput (ties broken by row order) and
# cut the ranks into quintile bins labelled 0..4 over the entire dataset.
# When the column is absent, every row falls back to bin 0.
if "TotalInput" not in df.columns:
    fluid_bin = pd.Series(np.zeros(len(df), dtype=int))
else:
    fluid_bin = pd.qcut(df["TotalInput"].rank(method="first"), q=5, labels=False)

# This dataset carries no vasopressor input, so the historical vaso bin is 0 everywhere.
vaso_bin = pd.Series(np.zeros(len(df), dtype=int))

# Flatten (fluid, vaso) into one of 25 action indices: a = fluid + 5 * vaso.
Action = (fluid_bin.to_numpy() + 5 * vaso_bin.to_numpy()).astype(int)

# Label vector and reward proxy: +24 where the label is 0 (alive), -24 otherwise.
y = df["90D_Mortality"].astype(int).to_numpy()
reward = np.where(y == 0, 24.0, -24.0)

# Features X: everything except the outcome columns.
# "Death" is removed together with the label: the imputation step above already
# leaves it out of the feature list, so keeping it here would leak an outcome
# column (never imputed) into the model inputs. errors="ignore" keeps this
# working when the file has no "Death" column.
X = df.drop(columns=["90D_Mortality", "Death"], errors="ignore")

# Next-state placeholder: row i's "next state" is simply row i+1 of the table.
# The final row has no successor, so it is forward-filled from the row before it.
# .ffill() replaces the deprecated fillna(method="ffill") (pandas FutureWarning).
# NOTE(review): the shift runs over the whole table, so it does not respect
# any per-patient boundaries.
Xnext = X.shift(-1).ffill()

# Two-stage stratified split: 70% train, then the remaining 30% is halved into
# validation and test (15% each). Both stages stratify on the label and share
# the same seed.
(X_train, X_tmp,
 y_train, y_tmp,
 A_train, A_tmp,
 Xnext_train, Xnext_tmp) = train_test_split(
    X, y, Action, Xnext,
    test_size=0.3,
    random_state=RANDOM_STATE,
    stratify=y,
)

(X_val, X_test,
 y_val, y_test,
 A_val, A_test,
 Xnext_val, Xnext_test) = train_test_split(
    X_tmp, y_tmp, A_tmp, Xnext_tmp,
    test_size=0.5,
    random_state=RANDOM_STATE,
    stratify=y_tmp,
)

# Standardise features using statistics learned from the training states only.
scaler = StandardScaler().fit(X_train)


def _scale_with_train_stats(frame):
    """Transform `frame` with the train-fitted scaler, keeping its columns and index."""
    return pd.DataFrame(
        scaler.transform(frame),
        columns=frame.columns,
        index=frame.index,
    )


# current states
X_train_scaled, X_val_scaled, X_test_scaled = map(
    _scale_with_train_stats, (X_train, X_val, X_test)
)

# next states go through the very same (train-fitted) transform
Xnext_train_scaled, Xnext_val_scaled, Xnext_test_scaled = map(
    _scale_with_train_stats, (Xnext_train, Xnext_val, Xnext_test)
)

# Bundle everything the saved file carries: the action-space size plus, for
# each split, the scaled states, scaled next states, labels and actions.
# NOTE(review): the `reward` array computed earlier is not stored here.
D = {
    "nbins": NBINS,
    # scaled states
    "X_train": X_train_scaled,
    "X_val": X_val_scaled,
    "X_test": X_test_scaled,
    # scaled next states
    "Xnext_train": Xnext_train_scaled,
    "Xnext_val": Xnext_val_scaled,
    "Xnext_test": Xnext_test_scaled,
    # labels
    "y_train": y_train,
    "y_val": y_val,
    "y_test": y_test,
    # action indices
    "Action_train": A_train,
    "Action_val": A_val,
    "Action_test": A_test,
}

# NOTE(review): the last cell of this notebook writes a different dictionary
# to the same path, replacing this file.
with open("requiredFile.pkl", "wb") as out_file:
    pickle.dump(D, out_file)

print("Saved requiredFile.pkl with 25-action space (vaso_bin=0 historically) and normalized splits.")


  Xnext = X.shift(-1).fillna(method="ffill")


Saved requiredFile.pkl with 25-action space (vaso_bin=0 historically) and normalized splits.


# My approach

In [64]:
# Load vaso.csv (presumably per-bin vasopressor infusion rates — confirm).
# Later cells read its rate_* columns and its icustay_id / bin key columns.
vaso = pd.read_csv('vaso.csv')

In [65]:
# Collapse the four per-drug rate columns into one value per row.
# Despite the "avg_" prefix this is the row-wise MAXIMUM of the four rates.
# Only the merge keys and the summary column are kept, so this cell cannot be
# re-run without first reloading vaso.csv (the rate_* columns are gone afterwards).
vaso_rate_cols = ['rate_norepinephrine', 'rate_epinephrine', 'rate_dopamine', 'rate_dobutamine']
vaso = vaso.assign(avg_vaso_rate=vaso[vaso_rate_cols].max(axis=1))
vaso = vaso.loc[:, ['icustay_id', 'bin', 'avg_vaso_rate']]

In [66]:
# Reload the raw CSV. This rebinds `df`, so the column drops and mean
# imputation applied to `df` in the first cell do not carry over to the cells below.
df = pd.read_csv('finalData.csv')

In [67]:
# Display the columns of the freshly loaded table.
df.columns

Index(['Unnamed: 0', 'hadm_id', 'icustay_id', 'subject_id', 'Age', 'Death',
       'Gender', 'race_asian', 'race_black', 'race_latino', 'race_white',
       'race_other', '90D_Mortality', 'Weight', 'bin', 'PH', 'PaO2', 'PaCO2',
       'PaO2/FiO2', 'ArterialBE', 'HCO3', 'FiO2', 'HGB', 'Chloride', 'Calcium',
       'Magnesium', 'SGPT', 'SGOT', 'Temperature', 'HR', 'RR', 'SBP', 'DBP',
       'MBP', 'ShockIndex', 'SpO2', 'AL', 'BUN', 'Creatinine', 'Platelet',
       'WBC', 'Potassium', 'Sodium', 'Glucose', 'PTT', 'PT', 'INR', 'TB', 'CB',
       'TotalInput', 'TotalOutput', '4hourlyOutput', 'SOFA', 'SIRS', 'GCS'],
      dtype='object')

In [68]:
# Attach the vaso summary to every (icustay_id, bin) row; the left join keeps
# all rows of df, leaving NaN where vaso has no matching row.
# NOTE(review): if vaso holds more than one row per (icustay_id, bin), this
# join duplicates the matching df rows — verify key uniqueness in vaso.csv.
df = pd.merge(df, vaso, how='left', on=['icustay_id', 'bin'])

In [69]:
# Rows with no match in the left join carry NaN; record them as a rate of 0.
df = df.assign(avg_vaso_rate=df['avg_vaso_rate'].fillna(0))

In [70]:
def discretize_iv(x, edges=(50, 180, 530)):
    """Map a fluid-input amount to an ordinal dose bin.

    Args:
        x: Scalar amount (e.g. one ``TotalInput`` value).
        edges: Ascending, inclusive upper bounds of bins 1..len(edges). The
            defaults are the cut points this function was originally
            hard-coded with.

    Returns:
        int: 0 when ``x`` is zero, negative or NaN; otherwise the 1-based
        position of the first edge with ``x <= edge``; ``len(edges) + 1``
        when ``x`` exceeds every edge (4 with the default edges).
    """
    # `not x > 0` is True for 0, for negatives and for NaN (any comparison
    # with NaN is False). Previously NaN failed every test and landed in the
    # top bin, and negative values landed in bin 1.
    if not x > 0:
        return 0
    for level, upper in enumerate(edges, start=1):
        if x <= upper:
            return level
    return len(edges) + 1

def discretize_vaso(x, edges=(0.08, 0.22, 0.45)):
    """Map a vasopressor rate to an ordinal dose bin.

    Args:
        x: Scalar rate (e.g. one ``avg_vaso_rate`` value).
        edges: Ascending, inclusive upper bounds of bins 1..len(edges). The
            defaults are the cut points this function was originally
            hard-coded with.

    Returns:
        int: 0 when ``x`` is zero, negative or NaN; otherwise the 1-based
        position of the first edge with ``x <= edge``; ``len(edges) + 1``
        when ``x`` exceeds every edge (4 with the default edges).
    """
    # `not x > 0` is True for 0, for negatives and for NaN (any comparison
    # with NaN is False). Previously NaN failed every test and landed in the
    # top bin, and negative values landed in bin 1.
    if not x > 0:
        return 0
    for level, upper in enumerate(edges, start=1):
        if x <= upper:
            return level
    return len(edges) + 1

# Bin both treatments (0..4 each) and flatten the pair into one action index
# in 0..24, with the fluid bin as the high-order digit: Action = 5 * iv_bin + vaso_bin.
df = df.assign(
    iv_bin=df['TotalInput'].map(discretize_iv),
    vaso_bin=df['avg_vaso_rate'].map(discretize_vaso),
)
df = df.assign(Action=df['iv_bin'] * 5 + df['vaso_bin'])

In [71]:
# sanity check: keep only the ICU stays that have exactly 20 rows
rows_per_stay = df.groupby('icustay_id')['icustay_id'].transform('size')
df = df[rows_per_stay == 20]

In [72]:
# Display the columns again; avg_vaso_rate, iv_bin, vaso_bin and Action have been added.
df.columns

Index(['Unnamed: 0', 'hadm_id', 'icustay_id', 'subject_id', 'Age', 'Death',
       'Gender', 'race_asian', 'race_black', 'race_latino', 'race_white',
       'race_other', '90D_Mortality', 'Weight', 'bin', 'PH', 'PaO2', 'PaCO2',
       'PaO2/FiO2', 'ArterialBE', 'HCO3', 'FiO2', 'HGB', 'Chloride', 'Calcium',
       'Magnesium', 'SGPT', 'SGOT', 'Temperature', 'HR', 'RR', 'SBP', 'DBP',
       'MBP', 'ShockIndex', 'SpO2', 'AL', 'BUN', 'Creatinine', 'Platelet',
       'WBC', 'Potassium', 'Sodium', 'Glucose', 'PTT', 'PT', 'INR', 'TB', 'CB',
       'TotalInput', 'TotalOutput', '4hourlyOutput', 'SOFA', 'SIRS', 'GCS',
       'avg_vaso_rate', 'iv_bin', 'vaso_bin', 'Action'],
      dtype='object')

In [73]:
# Columns that make up one state vector. The order of this list is the column
# order of the arrays built from `group[state_cols].values`, so keep it stable.
state_cols = [
    'Age', 'Gender',
    'race_asian', 'race_black', 'race_latino', 'race_white', 'race_other',
    'Weight',
    'PH', 'PaO2', 'PaCO2', 'PaO2/FiO2', 'ArterialBE', 'HCO3', 'FiO2',
    'HGB', 'Chloride', 'Calcium', 'Magnesium', 'SGPT', 'SGOT',
    'Temperature', 'HR', 'RR', 'SBP', 'DBP', 'MBP', 'ShockIndex', 'SpO2',
    'AL', 'BUN', 'Creatinine', 'Platelet', 'WBC',
    'Potassium', 'Sodium', 'Glucose',
    'PTT', 'PT', 'INR', 'TB', 'CB',
    'TotalInput', 'TotalOutput', '4hourlyOutput',
    'SOFA', 'SIRS', 'GCS',
]

In [74]:
# One group per ICU stay; each stay gets a consecutive integer id (0, 1, 2, ...)
# in the order in which the groupby lists its keys.
groups = df.groupby('icustay_id')
patient_ids = {
    stay_key: position
    for position, stay_key in enumerate(groups.groups)
}

In [75]:
# Accumulators: every list receives one entry per (stay, time step) transition.
X_list, Xnext_list = [], []
Action_list, ActionNext_list = [], []
Reward_list, Done_list = [], []
Bloc_list, SOFA_list = [], []

for icustay_id, group in groups:
    # Only stays with exactly 20 rows are turned into transitions.
    if len(group) != 20:
        continue

    # Order the rows of the stay by their time bin.
    group = group.sort_values('bin')

    states = group[state_cols].values
    actions = group['Action'].values
    sofas = group['SOFA'].values
    # The outcome is read from the first row of the stay.
    died = group['90D_Mortality'].iloc[0] == 1
    bloc_id = patient_ids[icustay_id]

    for t, (s, a, sofa_t) in enumerate(zip(states, actions, sofas)):
        if t == 19:
            # Final step: terminal reward of -24 (death) or +24, done flag set,
            # and an all-zero next state / next action as padding.
            r = -24 if died else 24
            d = 1
            s_next = np.zeros_like(s)
            a_next = 0
        else:
            # Intermediate step: reward is 0.6 times the SOFA decrease from
            # this step to the next (positive when SOFA goes down).
            r = 0.6 * (sofa_t - sofas[t + 1])
            d = 0
            s_next = states[t + 1]
            a_next = actions[t + 1]

        X_list.append(s)
        Xnext_list.append(s_next)
        Action_list.append(a)
        ActionNext_list.append(a_next)
        Reward_list.append(r)
        Done_list.append(d)
        Bloc_list.append(bloc_id)
        SOFA_list.append(sofa_t)

In [76]:
# Convert each transition list to a NumPy array, keyed by the names the saved
# file uses.
MIMICtable = {
    key: np.array(values)
    for key, values in (
        ('X', X_list),
        ('Xnext', Xnext_list),
        ('Action', Action_list),
        ('ActionNext', ActionNext_list),
        ('Reward', Reward_list),
        ('Done', Done_list),
        ('Bloc', Bloc_list),
        ('SOFA', SOFA_list),
    )
}

# NOTE(review): this is the same path the first cell writes to, so the
# train/val/test dictionary saved there is replaced by this table.
with open('requiredFile.pkl', 'wb') as out_file:
    pickle.dump(MIMICtable, out_file)
print('requiredFile.pkl created.')

requiredFile.pkl created.
