In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the cleaned dataset produced earlier in the notebook
# (the original note describes it as having 43 features).
df = pd.read_csv('./finalData.csv')

# Columns excluded from everything built below: record identifiers,
# the race indicator columns, and a few other columns that are not used.
cols = [
    'hadm_id', 'icustay_id', 'subject_id',
    'race_asian', 'race_black', 'race_latino', 'race_white', 'race_other',
    'HCO3', 'bin', 'Unnamed: 0',
]
# Reassign instead of mutating in place; the resulting frame is the same.
df = df.drop(columns=cols)

# Separate features and label
X = df.drop(columns=['90D_Mortality'])
y = df["90D_Mortality"]

# Simulated action from fluids (discrete bins).
# `duplicates='drop'` means fewer than 5 bins can result when quantile edges coincide.
# NOTE(review): rows with a missing TotalInput get action 0 via fillna(0), while the
# same missing value is mean-imputed in the features below -- confirm this is intended.
df["fluid_bin"] = pd.qcut(df["TotalInput"], q=5, labels=False, duplicates='drop')
action = df["fluid_bin"].fillna(0).astype(int)

# Train/val/test split (80/10/10).
# The split is done on the RAW features, before any statistics are computed, so the
# imputation means and scaling parameters below are learned from training rows only.
# (Previously they were computed on the full dataset, leaking validation/test
# information into preprocessing.) test_size and random_state are unchanged.
X_temp, X_test, y_temp, y_test, action_temp, action_test = train_test_split(
    X, y, action, test_size=0.1, random_state=42
)
X_train, X_val, y_train, y_val, action_train, action_val = train_test_split(
    X_temp, y_temp, action_temp, test_size=0.1111, random_state=42
)

# Learn the preprocessing statistics from the training split only
train_means = X_train.mean()
scaler = StandardScaler()
scaler.fit(X_train.fillna(train_means))


def _impute_and_scale(features, fill_values, fitted_scaler):
    """Fill missing values with `fill_values`, then standardize with `fitted_scaler`.

    Returns a new DataFrame with the same columns and the same row index as
    `features`, so it stays aligned with the matching label/action Series.
    """
    filled = features.fillna(fill_values)
    return pd.DataFrame(
        fitted_scaler.transform(filled),
        columns=features.columns,
        index=features.index,
    )


# Apply the train-fitted imputation and scaling to every feature frame.
# X and X_temp are transformed too, so every name this cell defined before
# still holds preprocessed features afterwards.
X, X_temp, X_train, X_val, X_test = (
    _impute_and_scale(part, train_means, scaler)
    for part in (X, X_temp, X_train, X_val, X_test)
)

# Placeholder next-state: an independent copy of the current-state features
Xnext, Xnext_temp, Xnext_train, Xnext_val, Xnext_test = (
    part.copy() for part in (X, X_temp, X_train, X_val, X_test)
)

# Bundle every split into one dictionary (features, labels, placeholder
# next-state features, and actions for train/val/test) and pickle it.
final_data = dict(
    # training split
    X_train=X_train,
    y_train=y_train,
    Xnext_train=Xnext_train,
    Action_train=action_train,
    # validation split
    X_val=X_val,
    y_val=y_val,
    Xnext_val=Xnext_val,
    Action_val=action_val,
    # test split
    X_test=X_test,
    y_test=y_test,
    Xnext_test=Xnext_test,
    Action_test=action_test,
)
pd.to_pickle(final_data, "requiredFile.pkl")
print("Saved requiredFile.pkl with missing values filled and normalized.")


Saved requiredFile.pkl with missing values filled and normalized.
