Study and DICOM IDs must be globally unique, so construct them by concatenating the patient, study, and DICOM IDs together.

In [None]:
import pandas as pd
import h5py
from tqdm import tqdm

In [None]:
# Rename the study and DICOM members of the feature file in place so that
# their names are globally unique:
#     <pid>/<sid>/<did>  ->  <pid>/<pid>_<sid>/<pid>_<sid>_<did>
# Members whose names already carry the expected prefix are skipped, so the
# cell can be re-run after an interruption.
#
# "r+" opens the existing file read/write and fails if it is missing; "a"
# would silently create an empty file when the path is wrong.
with h5py.File("/opt/gpudata/labrag-chexpertplus/biovilt-features.h5", "r+") as h5:
    for k in ["img_embed", "img_proj"]:
        group_k = h5[k]
        # Each keys() view is snapshotted with list() because members are
        # renamed inside the loops; iterating a live view of a group that is
        # being modified depends on h5py's iterator internals.
        for pid in tqdm(list(group_k.keys())):
            group_p = group_k[pid]
            study_prefix = f"{pid}_"
            for sid in list(group_p.keys()):
                if sid.startswith(study_prefix):
                    # Study already renamed (its DICOMs were renamed first).
                    continue
                group_s = group_p[sid]
                dicom_prefix = f"{pid}_{sid}_"
                for did in list(group_s.keys()):
                    if did.startswith(dicom_prefix):
                        # DICOM already renamed by an interrupted earlier run.
                        continue
                    group_s.move(did, f"{dicom_prefix}{did}")
                # Rename the study last so an interrupted run resumes its
                # remaining DICOMs under the original study name.
                group_p.move(sid, f"{study_prefix}{sid}")


In [None]:
# Load the label records (JSON Lines) and the clinical table.
# Original author's note on the label source: use labels derived from study.
labels = pd.read_json(
    "/opt/gpudata/chexpertplus/report_fixed.json",
    lines=True,
)
clinical = pd.read_csv("/opt/gpudata/chexpertplus/df_chexpert_plus_240401.csv")

# Every column after the first one is treated as a label column.
label_cols = labels.columns[1:].tolist()

In [None]:
# Add subject/study/DICOM ID columns to both tables in place.
# "path_to_image" is split on "/": component 1 becomes the subject ID,
# component 2 the study, and component 3 (text before the first ".") the image.
for df in (clinical, labels):
    path_parts = df["path_to_image"].str.split("/")
    patient = path_parts.str[1]
    study = patient + "_" + path_parts.str[2]
    image_stem = path_parts.str[3].str.split(".").str[0]

    df["subject_id"] = patient
    df["study_id"] = study
    df["dicom_id"] = study + "_" + image_stem

In [None]:
# The image for patient32368 could not be processed (cause unknown), so that
# subject is removed from both tables and the row index is rebuilt.
excluded_subject = "patient32368"
clinical = clinical.loc[clinical["subject_id"].ne(excluded_subject)].reset_index(drop=True)
labels = labels.loc[labels["subject_id"].ne(excluded_subject)].reset_index(drop=True)

In [None]:
# Derive a new validation split and use the provided validation split as the
# test split.
is_train = clinical["split"] == "train"
train_subjects = clinical.loc[is_train, "subject_id"].drop_duplicates()

# Sample 400 distinct training subjects with a fixed seed.
new_validate = set(
    train_subjects.sample(n=400, replace=False, random_state=42)
)

# Relabel the provided "valid" rows as "test", then move every row of a
# sampled subject into "validate".
clinical["split"] = clinical["split"].replace({"valid": "test"})
in_new_validate = clinical["subject_id"].isin(new_validate)
clinical.loc[in_new_validate, "split"] = "validate"

In [None]:
# Build the four output tables; each carries the three ID columns plus its
# own payload columns. Labels come from `labels`, everything else from
# `clinical`.
split_df = clinical.loc[:, ["subject_id", "study_id", "dicom_id", "split"]]
metadata_df = clinical.loc[:, ["subject_id", "study_id", "dicom_id", "ap_pa"]]
label_df = labels.loc[:, ["subject_id", "study_id", "dicom_id", *label_cols]]
report_df = clinical.loc[
    :,
    ["subject_id", "study_id", "dicom_id", "section_findings", "section_impression"],
]

In [None]:
# Reduce the label and report tables to one row per (subject, study): after
# sorting by the ID columns, only the first row of each study is kept.
label_df = (
    label_df
    .sort_values(by=["subject_id", "study_id", "dicom_id"])
    .drop_duplicates(subset=["subject_id", "study_id"], keep="first")
    .reset_index(drop=True)
)
report_df = (
    report_df
    .sort_values(by=["subject_id", "study_id", "dicom_id"])
    .drop_duplicates(subset=["subject_id", "study_id"], keep="first")
    .reset_index(drop=True)
)

In [None]:
# Write each table to its CSV file, without the row index.
for frame, destination in (
    (split_df, "/opt/gpudata/labrag-chexpertplus/split.csv"),
    (metadata_df, "/opt/gpudata/labrag-chexpertplus/metadata.csv"),
    (label_df, "/opt/gpudata/labrag-chexpertplus/label.csv"),
    (report_df, "/opt/gpudata/labrag-chexpertplus/report.csv"),
):
    frame.to_csv(destination, index=False)

In [None]:
# Tally the rows for each distinct "ap_pa" value (shown as the cell output).
metadata_df.loc[:, "ap_pa"].value_counts()