In [None]:
import os
import h5py
import pandas as pd

# Root directory holding the reference CSVs and every baseline's outputs; all
# paths read or written by this notebook are joined onto it.
# Export RRG_BASELINES_DIR (e.g. RRG_BASELINES_DIR=/path/to) to point the
# notebook at another location without editing it. When the variable is unset
# or empty, the original hard-coded location is used.
prefix = os.environ.get("RRG_BASELINES_DIR") or "/opt/gpudata/rrg-data-2/baselines"

In [None]:
# Reference (ground-truth) report sections. Only the two key columns and the
# section text are kept, and the text column is renamed to "actual_text" in
# both frames so findings and impression references share one schema.
findings = (
    pd.read_csv(os.path.join(prefix, "findings.csv"))
    .loc[:, ["study_id", "dicom_id", "findings"]]
    .rename(columns={"findings": "actual_text"})
)
impression = (
    pd.read_csv(os.path.join(prefix, "impression.csv"))
    .loc[:, ["study_id", "dicom_id", "impression"]]
    .rename(columns={"impression": "actual_text"})
)

In [None]:
# Sanity check: each reference table holds at most one row per
# (study_id, dicom_id) pair.
assert not findings.duplicated(subset=["study_id", "dicom_id"]).any()
assert not impression.duplicated(subset=["study_id", "dicom_id"]).any()

In [None]:
# Read the per-row DICOM ids ("did") and study ids ("sid") stored in
# cxrrepair/data/test/cxr.h5. `dicom_ids` ends up as a plain Python list of
# str, `study_ids` stays whatever array h5py returns for the dataset.
# NOTE(review): if "did" is stored as variable-length strings, h5py hands back
# bytes objects and astype(str) would yield "b'...'" reprs - confirm the
# dataset dtype.
cxr_h5_path = os.path.join(prefix, "cxrrepair/data/test/cxr.h5")
with h5py.File(cxr_h5_path, mode="r") as h5:
    did_values = h5["did"][:]
    dicom_ids = did_values.astype(str).tolist()
    study_ids = h5["sid"][:]

In [None]:
def _read_outputs(relative_path):
    """Load a CSV located at `relative_path` below the `prefix` directory."""
    return pd.read_csv(os.path.join(prefix, relative_path))


# Raw generations of each baseline. The trailing comment on every line lists
# the header of the CSV being read.
cxrrepair = _read_outputs("cxrrepair/outputs/generations.csv")  # study_id,generated_text
cxrredone = _read_outputs("cxrredone/outputs/generations.csv")  # study_id,Report Impression
xrem = _read_outputs("xrem/outputs/generations.csv")  # study_id,Report Impression,filtered
cxrmate = _read_outputs("cxrmate/outputs/generations.csv")  # study_id,dicom_id,findings,impression

# NOTE(review): this reads the ".OLD" file rather than generations.csv -
# confirm that is the intended input.
chexagent = _read_outputs("chexagent/outputs/generations.csv.OLD")  # study_id,findings,impression

rgrg = _read_outputs("rgrg/outputs/generations.csv")  # study_id,actual_text,generated_text

In [None]:
# Row i of each of these three CSVs must describe the same study as row i of
# the H5 file; the DICOM ids read from the H5 file are attached to these
# frames by position, so any mismatch here would mislabel rows.
assert not (cxrrepair["study_id"] != study_ids).any()
assert not (cxrredone["study_id"] != study_ids).any()
assert not (xrem["study_id"] != study_ids).any()

In [None]:
def select_view(df, reference):
    """Keep the rows of `df` whose (study_id, dicom_id) pair occurs in `reference`.

    `df` carries no DICOM id of its own, so the module-level `dicom_ids` list
    is attached by position: row i of `df` receives `dicom_ids[i]`.

    Args:
        df: DataFrame with a "study_id" column and exactly one row per entry
            of `dicom_ids`, in the same order.
        reference: DataFrame with "study_id" and "dicom_id" columns.

    Returns:
        A new DataFrame with the columns of `df` plus "dicom_id", inner-joined
        against `reference` on both keys. `df` itself is not modified.

    Raises:
        ValueError: if `df` and `dicom_ids` differ in length.
    """
    if len(dicom_ids) != len(df):
        raise ValueError(
            f"expected {len(dicom_ids)} rows to match dicom_ids, got {len(df)}"
        )
    # assign() builds a new frame. Writing the column with df[...] = ... would
    # silently add "dicom_id" to the caller's DataFrame as a side effect.
    tagged = df.assign(dicom_id=dicom_ids)
    return tagged.merge(reference[["study_id", "dicom_id"]], on=["study_id", "dicom_id"])

# Restrict the three baselines that lack their own dicom_id column to the
# (study_id, dicom_id) pairs present in the impression reference.
cxrrepair, cxrredone, xrem = (
    select_view(generations, impression)
    for generations in (cxrrepair, cxrredone, xrem)
)

In [None]:
# Every baseline frame must hold at most one row per study at this point.
assert cxrrepair["study_id"].is_unique
assert cxrredone["study_id"].is_unique
assert xrem["study_id"].is_unique
assert cxrmate["study_id"].is_unique
assert chexagent["study_id"].is_unique
assert rgrg["study_id"].is_unique

In [None]:
def _as_generated(frame, source_column):
    """Return a copy of `frame` with `source_column` renamed to "generated_text"."""
    return frame.rename(columns={source_column: "generated_text"})


# Give every baseline a "generated_text" column.
# cxrrepair and rgrg already ship one, so they are left as they are.

# Impression-level outputs
cxrredone = _as_generated(cxrredone, "Report Impression")
xrem = _as_generated(xrem, "filtered")
cxrmate_impression = _as_generated(cxrmate, "impression")
chexagent_impression = _as_generated(chexagent, "impression")

# Findings-level outputs
cxrmate_findings = _as_generated(cxrmate, "findings")
chexagent_findings = _as_generated(chexagent, "findings")

In [None]:
# Attach the reference text to every baseline. These are inner joins on
# study_id, so a study missing from the reference table drops out.
# NOTE(review): the reference tables were only asserted unique per
# (study_id, dicom_id); joining on study_id alone could duplicate rows, which
# the study_id uniqueness assertions after this cell would catch.
impression_text = impression[["study_id", "actual_text"]]
findings_text = findings[["study_id", "actual_text"]]

# Impression-level baselines
cxrrepair = cxrrepair.merge(impression_text, on="study_id")
cxrredone = cxrredone.merge(impression_text, on="study_id")
xrem = xrem.merge(impression_text, on="study_id")
cxrmate_impression = cxrmate_impression.merge(impression_text, on="study_id")
chexagent_impression = chexagent_impression.merge(impression_text, on="study_id")

# Findings-level baselines
cxrmate_findings = cxrmate_findings.merge(findings_text, on="study_id")
chexagent_findings = chexagent_findings.merge(findings_text, on="study_id")

# rgrg already has an actual_text column, so nothing is attached. Only check
# that an inner join against the findings study ids leaves rgrg unchanged,
# i.e. no row is dropped or duplicated.
assert rgrg.merge(findings[["study_id"]], on=["study_id"]).equals(rgrg)

In [None]:
# The joins against the reference tables must not have introduced a second
# row for any study.

# Impression-level frames
assert cxrrepair["study_id"].is_unique
assert cxrredone["study_id"].is_unique
assert xrem["study_id"].is_unique
assert cxrmate_impression["study_id"].is_unique
assert chexagent_impression["study_id"].is_unique

# Findings-level frames
assert cxrmate_findings["study_id"].is_unique
assert chexagent_findings["study_id"].is_unique
assert rgrg["study_id"].is_unique

In [None]:
cols = ["study_id", "actual_text", "generated_text"]


def _join_sections(findings_frame, impression_frame):
    """Combine per-study findings and impression texts into single reports.

    Both inputs are reduced to `cols` and inner-joined on study_id. The
    returned frame keeps the suffixed per-section columns and adds
    "actual_text" / "generated_text" columns of the form
    "Findings: <findings>\\nImpression: <impression>".
    A missing (NaN) section makes the combined string NaN as well.
    """
    joint = findings_frame[cols].merge(
        impression_frame[cols],
        on="study_id",
        suffixes=("_findings", "_impression"),
    )
    joint["actual_text"] = (
        "Findings: " + joint["actual_text_findings"]
        + "\nImpression: " + joint["actual_text_impression"]
    )
    joint["generated_text"] = (
        "Findings: " + joint["generated_text_findings"]
        + "\nImpression: " + joint["generated_text_impression"]
    )
    return joint


# Only the two baselines that generate both sections get a joint table.
cxrmate_joint = _join_sections(cxrmate_findings, cxrmate_impression)
chexagent_joint = _join_sections(chexagent_findings, chexagent_impression)

# Write one CSV per (baseline, section) into the `prefix` directory, keeping
# only the columns listed in `cols` and omitting the index. The dict keeps the
# files in the order: impression, findings, then both sections combined.
csv_outputs = {
    "cxrrepair_impression.csv": cxrrepair,
    "cxrredone_impression.csv": cxrredone,
    "xrem_impression.csv": xrem,
    "cxrmate_impression.csv": cxrmate_impression,
    "chexagent_impression.csv": chexagent_impression,
    "cxrmate_findings.csv": cxrmate_findings,
    "chexagent_findings.csv": chexagent_findings,
    "rgrg_findings.csv": rgrg,
    "cxrmate_both.csv": cxrmate_joint,
    "chexagent_both.csv": chexagent_joint,
}
for out_name, out_frame in csv_outputs.items():
    out_frame[cols].to_csv(os.path.join(prefix, out_name), index=False)