In [None]:
# Step 1: Load core ICU info
import pandas as pd
import numpy as np

# Load ICU stays; intime/outtime are parsed as datetimes so later steps can
# compare them with procedure start times.
icustays = pd.read_csv("icustays.csv.gz", compression="gzip", parse_dates=["intime", "outtime"])

# Take an explicit copy of the metadata columns: `structured_note` is added to
# this frame below, and writing a new column into a bare column selection of
# `icustays` raises pandas' SettingWithCopyWarning.
icu_meta = icustays[["subject_id", "hadm_id", "stay_id", "intime", "first_careunit", "los"]].copy()

# Load hospital admission info and assign mortality label.
# The label is the admission-level `hospital_expire_flag`, attached to every
# ICU stay of that admission through the (subject_id, hadm_id) merge.
admissions = pd.read_csv("admissions.csv.gz", compression='gzip')
df_mortality = admissions.merge(icustays, on=["subject_id", "hadm_id"])
df_mortality["mortality_label"] = df_mortality["hospital_expire_flag"]
df_mortality = df_mortality[["subject_id", "hadm_id", "stay_id", "mortality_label"]]

# Step 2: Generate structured_note from icustays
# One templated sentence per ICU stay built from the care unit and the length
# of stay rounded to one decimal.
icu_meta["structured_note"] = icu_meta.apply(
    lambda row: f"The patient was admitted to the {row['first_careunit']} and stayed for {round(row['los'], 1)} days.",
    axis=1
)

# Step 3: Load procedure events (starttime parsed so it can be compared with intime)
procedures = pd.read_csv("procedureevents.csv.gz", compression="gzip", parse_dates=["starttime"])

# Step 4: Load d_items to decode procedure names
items = pd.read_csv("d_items.csv.gz", compression="gzip")
proc_map = items.set_index("itemid")["label"].to_dict()
procedures["proc_name"] = procedures["itemid"].map(proc_map)

# Step 5: Merge intime into procedures
proc_with_time = procedures.merge(
    icustays[["subject_id", "stay_id", "intime"]],
    on=["subject_id", "stay_id"],
    how="left",
)

# Keep only those within 24h of ICU admission.
# Rows whose intime is missing after the left merge compare False on both
# conditions and are therefore dropped as well.
proc_with_time = proc_with_time[
    (proc_with_time["starttime"] >= proc_with_time["intime"])
    & (proc_with_time["starttime"] <= proc_with_time["intime"] + pd.Timedelta(hours=24))
]

# Step 6: Aggregate procedure descriptions
# Unique procedure names per stay, sorted so the generated sentence is
# identical from run to run (iteration order of a set of strings is not stable
# across interpreter sessions).
proc_summaries = (
    proc_with_time
    .groupby(["subject_id", "hadm_id", "stay_id"])["proc_name"]
    .apply(lambda names: sorted(set(names.dropna())))
    .reset_index()
)

# Turn procedure list into sentence (empty string when a stay has no named procedure)
proc_summaries["procedure_note"] = proc_summaries["proc_name"].apply(
    lambda names: "The patient received the following procedures within the first 24 hours: " + ", ".join(names) + "." if names else "",
)

# Step 7: Merge all back together
# Left merge keeps every ICU stay; stays without procedures get an empty note.
icu_enriched = icu_meta.merge(
    proc_summaries[["subject_id", "hadm_id", "stay_id", "procedure_note"]],
    on=["subject_id", "hadm_id", "stay_id"],
    how="left",
)

icu_enriched["procedure_note"] = icu_enriched["procedure_note"].fillna("")

# Step 8: Add discharge notes
print("Loading discharge notes...")
discharge = pd.read_csv("discharge.csv.gz", compression='gzip')

# One note per admission: order by charttime and keep the last row of each
# hadm_id, then strip bracketed [** ... **] placeholders from the text.
discharge_latest = (
    discharge
    .sort_values("charttime")
    .drop_duplicates("hadm_id", keep="last")
    .assign(text_note=lambda notes: notes["text"].str.replace(r"\[\*\*.*?\*\*\]", "", regex=True))
)

# Inner merge: stays whose admission has no discharge note are dropped here.
df_mortality = df_mortality.merge(
    discharge_latest.loc[:, ["hadm_id", "text_note"]],
    how="inner",
    on="hadm_id",
)

# Step 9: Add lab-based vitals from labevents
print("Loading lab events for vitals...")
labs = pd.read_csv("labevents.csv.gz", compression="gzip", parse_dates=["charttime"])

# Lab itemids to keep and the column name each one is published under.
# NOTE(review): several itemid -> label pairs look inconsistent with the MIMIC
# d_labitems dictionary (e.g. 50912 is usually creatinine, 50971 potassium) —
# verify against d_labitems before trusting the column names downstream.
vital_lab_items = {
    50912: "glucose",
    50983: "sodium",
    50822: "potassium",
    51006: "bun",
    50971: "creatinine",
    50868: "calcium",
    50862: "chloride",
    50902: "magnesium",
    50809: "albumin",
    50820: "bilirubin_total"
}

# Restrict to the tracked items, label them, coerce `value` to numeric
# (non-numeric entries become NaN) and discard rows without a numeric value.
labs = (
    labs.loc[labs["itemid"].isin(list(vital_lab_items))]
    .assign(
        vital_label=lambda frame: frame["itemid"].map(vital_lab_items),
        value=lambda frame: pd.to_numeric(frame["value"], errors="coerce"),
    )
    .dropna(subset=["value"])
)

# Median value per admission and lab item
lab_summary = labs.groupby(["subject_id", "hadm_id", "itemid"], as_index=False)["value"].median()
lab_summary["vital_label"] = lab_summary["itemid"].map(vital_lab_items)

# Wide layout: one row per (subject_id, hadm_id), one column per lab label
lab_pivot = (
    lab_summary
    .pivot_table(index=["subject_id", "hadm_id"], columns="vital_label", values="value")
    .reset_index()
    .rename_axis(columns=None)
)

# Step 10: Add CXR image path
print("Loading CXR metadata...")
study_df = pd.read_csv("cxr-study-list.csv.gz", compression="gzip")
record_df = pd.read_csv("cxr-record-list.csv.gz", compression="gzip")
cxr_meta = study_df.merge(record_df, on=["subject_id", "study_id"], how="inner")

# Build the path as files/p<first 2 digits of subject>/p<subject>/s<study>/<dicom>.
# Previously every component was filled from dicom_id, which produced paths
# that do not follow this layout. The "mimic-cxr/" prefix and ".jpg.gz" suffix
# are kept unchanged.
# NOTE(review): confirm prefix/suffix against how the images are stored locally.
cxr_meta["image_path"] = [
    f"mimic-cxr/files/p{str(subject_id).zfill(8)[:2]}/p{str(subject_id).zfill(8)}/s{study_id}/{dicom_id}.jpg.gz"
    for subject_id, study_id, dicom_id in zip(
        cxr_meta["subject_id"], cxr_meta["study_id"], cxr_meta["dicom_id"]
    )
]

# One image per subject: the row with the smallest study_id
cxr_meta_dedup = cxr_meta.sort_values("study_id").drop_duplicates("subject_id", keep="first")

# Step 11: Merge everything into one final dataset
# - ICU metadata and lab medians are attached with left merges (rows are kept
#   even when those sources have no match);
# - the CXR merge is an inner merge, so subjects without an image are dropped;
# - combined_note concatenates procedure, structured and discharge text,
#   separated by single spaces, with missing pieces treated as "".
df_full = (
    df_mortality
    .merge(icu_enriched, how="left", on=["subject_id", "hadm_id", "stay_id"])
    .merge(lab_pivot, how="left", on=["subject_id", "hadm_id"])
    .merge(cxr_meta_dedup.loc[:, ["subject_id", "image_path"]], how="inner", on="subject_id")
    .assign(
        combined_note=lambda merged: (
            merged["procedure_note"].fillna("")
            + " "
            + merged["structured_note"].fillna("")
            + " "
            + merged["text_note"].fillna("")
        )
    )
)

# Save final result
df_full.to_csv("final_multimodal_dataset.csv", index=False)

# Output summary
print("\nFinal dataset columns:")
print(df_full.columns.tolist())


In [None]:
# Remove the sparsely populated lab columns, then keep only rows that are
# complete in every remaining column.
# NOTE(review): `filtered_dataset` is not defined in any cell shown in this
# notebook — confirm where it is created so this cell runs on a fresh kernel.
clean_dataset = (
    filtered_dataset
    .drop(columns=['potassium', 'albumin', 'chloride', 'bilirubin_total'])
    .dropna()
)

# Report how many rows survived the filtering
print(f"Remaining rows after dropping missing values: {len(clean_dataset)}")


## Add image information

In [None]:
# Load the CXR record list (subject_id / study_id / dicom_id per image).
# NOTE(review): this cell reads from "data_process/" while the first cell reads
# the same file name from the working directory — confirm which is intended.
cxr_records = pd.read_csv(
    filepath_or_buffer="data_process/cxr-record-list.csv.gz",
    compression="gzip",
)

In [None]:
def _dicom_path(row):
    """Return the files/pNN/p<subject>/s<study>/<dicom>.dcm path for one row."""
    subject_dir = str(row['subject_id']).zfill(8)
    return f"files/p{subject_dir[:2]}/p{subject_dir}/s{int(row['study_id'])}/{row['dicom_id']}.dcm"


# One record per subject: the row with the smallest study_id
cxr_records_dedup = cxr_records.sort_values("study_id").drop_duplicates("subject_id", keep="first")

# Attach study_id and dicom_id; the inner merge drops subjects without a CXR record
df_merged = clean_dataset.merge(
    cxr_records_dedup.loc[:, ["subject_id", "study_id", "dicom_id"]],
    how="inner",
    on="subject_id",
)

# Derive the image path from the identifiers, row by row
df_merged["image_path"] = df_merged.apply(_dicom_path, axis=1)

In [None]:
# List the columns, then preview the first rows. Jupyter only renders the last
# expression of a cell, so the frame preview has to come last; a bare
# `df_merged` before the print was never displayed.
print(df_merged.columns.tolist())
df_merged.head()

In [None]:
# Persist the merged table (identifiers included) without the index column;
# the feature-extraction cell reads this file back.
df_merged.to_csv(path_or_buf="final_image.csv", index=False)

In [None]:
# Modelling table: same rows as df_merged with the identifier columns removed
final = df_merged.drop(
    labels=['study_id', 'dicom_id', 'subject_id', 'hadm_id', 'stay_id'],
    axis="columns",
)

In [None]:
# Write the identifier-free table to disk without the index column
final.to_csv(path_or_buf="final.csv", index=False)


## Process image features

In [None]:
import torch
from tqdm.auto import tqdm
import pandas as pd
from datasets import load_dataset
import torchxrayvision as xrv
import torchvision.transforms as T
import os

# 1. Load 'final_image.csv' with every column read as a string.
#    The Python parser is used with on_bad_lines="skip", so malformed lines are
#    silently dropped rather than raising.
df_list = pd.read_csv(
    "final_image.csv",
    dtype=str,
    engine="python",
    on_bad_lines="skip",
)
print(f"Loaded {len(df_list)} valid rows.")
print("Columns:", df_list.columns.tolist())

# Summary statistics for the image path column
total_rows = df_list.shape[0]
total_paths = df_list["image_path"].shape[0]
unique_paths = len(set(df_list["image_path"]))
null_paths = df_list["image_path"].isna().sum()
empty_paths = df_list["image_path"].eq("").sum()

for label, count in (
    ("Total rows", total_rows),
    ("Total paths", total_paths),
    ("Unique paths", unique_paths),
    ("Null paths", null_paths),
    ("Empty paths", empty_paths),
):
    print(f"{label}: {count}")

# Map each image path to the positional row indices that reference it, so
# features computed once per image can be fanned back out to every row.
path_to_indices = {}
for position, image_path in enumerate(df_list["image_path"]):
    path_to_indices.setdefault(image_path, []).append(position)

# Work list: each distinct path once, in order of first appearance
unique_paths = list(path_to_indices)
print(f"Total unique paths to process: {len(unique_paths)}")

# 2. Set device and load pretrained model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = xrv.models.DenseNet(weights="densenet121-res224-mimic_nb").to(device).eval()

# 3. Image preprocessing pipeline (PIL -> grayscale -> 224x224 tensor -> xrv range)
#    torchxrayvision models take single-channel images scaled to [-1024, 1024]
#    (what xrv.datasets.normalize(img, 255) produces for 8-bit input), not
#    ImageNet-normalized RGB. The previous 3-channel Normalize fed the network
#    values far outside that range and fails on single-channel images.
#    Output shape is [1, 224, 224].
transform = T.Compose([
    T.Grayscale(num_output_channels=1),
    T.Resize((224, 224)),
    T.ToTensor(),                                      # float tensor in [0, 1]
    T.Lambda(lambda pixels: (2.0 * pixels - 1.0) * 1024.0),  # rescale to [-1024, 1024]
])

# 4. Stream MIMIC-CXR image data from Hugging Face
ds_stream = load_dataset(
    "StanfordAIMI/mimic-cxr-images-512",
    split="train",
    streaming=True
)

# 5. Store extracted features using image path as key
path_to_features = {}

# Index the wanted paths by their extension-less form so each streamed example
# is matched with a single dict lookup instead of a scan over every remaining
# path. Non-string entries (e.g. missing image_path values) cannot match any
# example and are skipped here rather than crashing os.path.splitext.
pending_by_stem = {}
for wanted_path in unique_paths:
    if isinstance(wanted_path, str):
        pending_by_stem.setdefault(os.path.splitext(wanted_path)[0], []).append(wanted_path)
n_wanted = sum(len(group) for group in pending_by_stem.values())

# 6. Stream and process images
for ex in tqdm(ds_stream, total=377_110, desc="Scanning shards"):
    # Match on the path without its extension (extensions differ between sources)
    hf_stem = os.path.splitext(ex["path"])[0]
    matching_paths = pending_by_stem.pop(hf_stem, [])

    if matching_paths:
        # Image preprocessing + feature extraction, done once per streamed image
        img3 = transform(ex["image"])
        img1 = img3.mean(0, keepdim=True).unsqueeze(0).to(device)

        with torch.no_grad():
            feat = model.features(img1)
            feat = feat.mean([2, 3]).squeeze(0)  # [1024]

        feat_values = feat.cpu().numpy().tolist()

        for path in matching_paths:
            # Each path gets its own list so later edits cannot alias
            path_to_features[path] = list(feat_values)

            # Periodically report progress (counts refer to distinct wanted paths)
            if len(path_to_features) % 10 == 0:
                print(f"Processed {len(path_to_features)}/{n_wanted} images. {n_wanted - len(path_to_features)} remaining.")

    # Exit early if all paths are processed
    if not pending_by_stem:
        break

# Leave `unique_paths` holding only the paths for which no image was found
unique_paths = [path for path in unique_paths if path not in path_to_features]

print(f"Extracted features for {len(path_to_features)} images.")

# 7. Map features back to all original rows
records = []
for path, feat in path_to_features.items():
    # Parse subject and study IDs from the path. Paths look like
    # ".../pNN/p<subject>/s<study>/<file>", so the IDs are the two directories
    # directly above the file name. Counting from the end keeps this correct
    # regardless of any leading prefix; indexing from the front picked up the
    # "pNN" bucket and the subject folder instead.
    parts = path.split("/")
    if len(parts) >= 3:
        subject_dir, study_dir = parts[-3], parts[-2]
        subject_id = subject_dir[1:] if subject_dir.startswith("p") else subject_dir
        study_id = study_dir[1:] if study_dir.startswith("s") else study_dir
    else:
        # Path too short to contain both folders
        subject_id = study_id = None

    # For each original row associated with this path, create a full record
    for idx in path_to_indices[path]:
        row = df_list.iloc[idx]
        record = {
            "subject_id": subject_id,
            "study_id": study_id,
            "image_path": path,
            "feat_1024": feat,
        }

        # Add all other columns from the original row. Columns with the same
        # name as a key above (e.g. subject_id, study_id) take the CSV's value.
        record.update({col: row[col] for col in df_list.columns if col != "image_path"})

        records.append(record)

print(f"Created {len(records)} records with features.")

# 8. Save all records
df_feats = pd.DataFrame(records)
df_feats.to_pickle("final_image_feats.pkl")
print(f"Saved {len(df_feats)} records to final_image_feats.pkl")
