In [None]:
import sys
from os import getcwd
from os.path import join
import pandas as pd
sys.path.insert(0, join(getcwd(), "../module_code"))

from data.load import load_data, load_outcomes
from data.utils import read_files_and_combine
from cli_utils import load_cli_args, init_cli_args

# Reduce argv to just the program name before the project's CLI helpers run.
# NOTE(review): presumably this keeps the Jupyter kernel's own launch arguments
# away from the project's argument parser — confirm against cli_utils.
sys.argv = [sys.argv[0]]
load_cli_args("../options.yml")
args = init_cli_args()
# Maps a table name to the distinct-patient counts recorded for each cohort.
unique = {}

# Providers

In [None]:
# Read the providers table once per cohort directory (CRRT first, then control).
provider = "Providers.txt"
crrt_df, ctrl_df = (
    read_files_and_combine([provider], data_dir)
    for data_dir in (args.ucla_crrt_data_dir, args.ucla_control_data_dir)
)

In [None]:
# Print both cohorts' column names so they can be compared by eye.
print(crrt_df.columns)
print(ctrl_df.columns)

# Demographics

In [None]:
# Read the demographics table once per cohort directory (CRRT first, then control).
demo = "Patient_Demographics.txt"
crrt_df, ctrl_df = (
    read_files_and_combine([demo], data_dir)
    for data_dir in (args.ucla_crrt_data_dir, args.ucla_control_data_dir)
)

In [None]:
# Record how many distinct patient IDs each cohort has in the demographics table.
unique["demographics"] = {
    cohort: frame["IP_PATIENT_ID"].nunique()
    for cohort, frame in (("crrt", crrt_df), ("ctrl", ctrl_df))
}
print(unique)

In [None]:
# Column names of the CRRT demographics table.
crrt_df.columns

In [None]:
# Column names of the control demographics table.
ctrl_df.columns

In [None]:
import matplotlib.pyplot as plt

def plot_demo_breakdown(dfs: list[pd.DataFrame], ctn: list[str], drop_cols: list[str], ax: plt.Axes):
    """Draw one subplot per remaining column of each frame, one frame per grid row.

    Columns listed in ``ctn`` are drawn as histograms of their raw values;
    every other column is drawn as a bar chart of its value counts.

    Args:
        dfs: Frames to plot; a frame's position in the list selects the grid row.
        ctn: Names of the columns to draw as histograms.
        drop_cols: Columns removed from each frame before plotting.
        ax: Grid of axes. Although annotated as a single ``plt.Axes``, it is
            indexed as ``ax[row, col]`` and so must support 2-D indexing.
    """
    for row, frame in enumerate(dfs):
        plotted_cols = frame.drop(drop_cols, axis=1).columns
        for col_idx, colname in enumerate(plotted_cols):
            target = ax[row, col_idx]
            if colname in ctn:
                frame[colname].plot.hist(ax=target, title=colname)
                continue
            counts = frame[colname].value_counts()
            counts.plot.bar(ax=target, title=colname, rot=45, layout="tight")

# One grid row per cohort.
# NOTE(review): assumes at most 5 columns remain in each frame after the drop
# list is applied, otherwise the axes grid is indexed out of range — confirm.
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(20,10))
plot_demo_breakdown(
    [
        crrt_df,
        # Give the control PCP column the CRRT name so one drop list fits both frames.
        ctrl_df.rename({"IP_CURRENT_PCP_ID": "PCP_IP_PROVIDER_ID"}, axis=1)
    ],
    ["AGE"],
    ["IP_PATIENT_ID", "PCP_IP_PROVIDER_ID"],
    axs
)
# Label each grid row with its cohort name on the left-most subplot.
for ax, row in zip(axs[:,0], ["CRRT", "Control"]):
    ax.set_ylabel(row, rotation=0, size="large")
fig.tight_layout()
# pyplot.show() renders through the active backend; Figure.show() only works
# with a GUI backend and emits a warning under the notebook's inline backend.
plt.show()

In [None]:
def differences(col: str):
    """Return the values of ``col`` that occur in only one of the two cohorts.

    Reads the notebook-level ``crrt_df`` and ``ctrl_df`` frames. Values are
    taken from ``value_counts().index`` of each frame's column.
    """
    crrt_values = set(crrt_df[col].value_counts().index)
    ctrl_values = set(ctrl_df[col].value_counts().index)
    return crrt_values ^ ctrl_values

In [None]:
# Category values present in only one cohort, for race and for ethnicity.
print(differences("RACE"))
print(differences("ETHNICITY"))

# Vitals
The vital sign names seem to be mismatched between the CRRT patients and the controls at UCLA.

Controls have `WEIGHT/SCALE`, which interferes with splitting `SBP/DBP` on `/`.

In [None]:
# Read the vitals flowsheet once per cohort directory (CRRT first, then control).
vitals = "Flowsheet_Vitals.txt"
crrt_df, ctrl_df = (
    read_files_and_combine([vitals], data_dir)
    for data_dir in (args.ucla_crrt_data_dir, args.ucla_control_data_dir)
)

# Record how many distinct patient IDs each cohort has in the vitals table.
unique["vitals"] = {
    cohort: frame["IP_PATIENT_ID"].nunique()
    for cohort, frame in (("crrt", crrt_df), ("ctrl", ctrl_df))
}
print(unique)

In [None]:
# Distinct vital-sign type labels used in the CRRT vitals table.
crrt_df["VITAL_SIGN_TYPE"].unique()

In [None]:
# Distinct vital-sign type labels used in the control vitals table.
ctrl_df["VITAL_SIGN_TYPE"].unique()

They all map except for `O2 Device`, which CRRT patients have but controls do not.

In [None]:
# CRRT values recorded under the "SpO2" vital-sign type.
crrt_df.loc[crrt_df["VITAL_SIGN_TYPE"] == "SpO2", "VITAL_SIGN_VALUE"]

In [None]:
# Control values recorded under the "PULSE OXIMETRY" vital-sign type.
# Column names are upper-case to agree with the other lookups on this frame
# (ctrl_df["VITAL_SIGN_TYPE"] is read in upper-case elsewhere in this section);
# the lower-case spelling raises KeyError against those same columns.
ctrl_df.loc[ctrl_df["VITAL_SIGN_TYPE"] == "PULSE OXIMETRY", "VITAL_SIGN_VALUE"]

In [None]:
# CRRT values recorded under the "O2 Device" vital-sign type.
crrt_df.loc[crrt_df["VITAL_SIGN_TYPE"] == "O2 Device", "VITAL_SIGN_VALUE"]

# Diagnoses

In [None]:
# NOTE(review): `pt` is not referenced by any other visible cell; it looks like a
# leftover single-patient identifier — confirm it is still needed before sharing.
pt = "1E6D759D88A19B0CFFE1F2EF2B4238CD"
fname = "Encounter_Diagnoses.txt"
# Read the encounter-diagnoses table for each cohort.
crrt_df = read_files_and_combine([fname], args.ucla_crrt_data_dir)
ctrl_df = read_files_and_combine([fname], args.ucla_control_data_dir)

In [None]:
# Record how many distinct patient IDs each cohort has in the diagnoses table.
unique["diagnoses"] = {
    cohort: frame["IP_PATIENT_ID"].nunique()
    for cohort, frame in (("crrt", crrt_df), ("ctrl", ctrl_df))
}
print(unique)

In [None]:
from hcuppy.ccs import CCSEngine
from data.utils import read_files_and_combine
from data.longitudinal_utils import hcuppy_map_code

# Map the control cohort's diagnosis codes to CCS categories with the project helper.
dx_df = read_files_and_combine([fname], args.ucla_control_data_dir)
print(dx_df.shape)
ce = CCSEngine(mode="dx")
# NOTE(review): `icd10_mask` is computed but not used by any visible cell — confirm
# whether the mapping below was meant to be restricted to ICD-10 rows.
icd10_mask = dx_df["ICD_TYPE"] == 10
# Names for the columns produced from each CCS lookup result.
exploded_cols = ["dx_CCS_CODE",
            "dx_CCS_DESCRIPTION",
            "dx_CCS_LEVEL1",
            "dx_CCS_LEVEL1_DESCRIPTION",
            "dx_CCS_LEVEL2",
            "dx_CCS_LEVEL2_DESCRIPTION",
        ]
mapped = hcuppy_map_code(
        dx_df,
        code_col="ICD_CODE",
        exploded_cols=exploded_cols,
        hcuppy_converter_function=ce.get_ccs,
    )
# Compare against the input shape printed above.
print(mapped.shape)
mapped.head()

In [None]:
# Look up every ICD code directly and expand the per-row results into columns.
# The column labels are assigned positionally from `exploded_cols`.
mapped_dict = pd.DataFrame(dx_df["ICD_CODE"].apply(ce.get_ccs).tolist())
mapped_dict.columns = exploded_cols
mapped_dict

In [None]:
# Place the CCS columns next to the original diagnosis rows (joined on the index).
pd.concat([dx_df, mapped_dict], axis=1)

# Labs

In [None]:
# Read the labs table once per cohort directory (CRRT first, then control).
labs = "Labs.txt"
crrt_df, ctrl_df = (
    read_files_and_combine([labs], data_dir)
    for data_dir in (args.ucla_crrt_data_dir, args.ucla_control_data_dir)
)

# Record how many distinct patient IDs each cohort has in the labs table.
unique["labs"] = {
    cohort: frame["IP_PATIENT_ID"].nunique()
    for cohort, frame in (("crrt", crrt_df), ("ctrl", ctrl_df))
}
print(unique)

In [None]:
# Distinct lab component names in the CRRT labs table.
crrt_df["COMPONENT_NAME"].unique()

In [None]:
# Distinct lab component names in the control labs table.
ctrl_df["COMPONENT_NAME"].unique()

In [None]:
# Lab component names that appear in both cohorts.
crrt_set = set(crrt_df["COMPONENT_NAME"].unique())
ctrl_set = set(ctrl_df["COMPONENT_NAME"].unique())
same = crrt_set & ctrl_set
print(len(same))
same

In [None]:
# Lab component names that appear in exactly one of the two cohorts.
diff = crrt_set ^ ctrl_set
print(len(diff))
diff

In [None]:
import jellyfish
import itertools

def pairwise_diff(diff: set) -> pd.DataFrame:
    """Compute the Levenshtein distance between every ordered pair of names.

    Args:
        diff: Collection of strings to compare against each other.

    Returns:
        A DataFrame with a single column ``0`` holding the distance, indexed
        by the ``(a, b)`` pair and sorted ascending by distance. Ordered pairs
        are used, so both ``(a, b)`` and ``(b, a)`` appear. When ``diff`` has
        fewer than two elements there are no pairs, and an empty frame with
        column ``0`` is returned.
    """
    results = {
        combo: jellyfish.levenshtein_distance(*combo) for combo in itertools.permutations(diff, 2)
    }
    if not results:
        # With no pairs the frame built below has no column 0, so
        # sort_values(0) would raise KeyError; return an empty result instead.
        return pd.DataFrame(columns=[0])
    distances = pd.DataFrame(results.values(), index=results.keys()).sort_values(0)
    return distances

## In crrt but not control

In [None]:
# Pairwise distances among the lab names found only in the CRRT cohort.
dists = pairwise_diff(crrt_set - ctrl_set)
dists

## In control but not crrt

In [None]:
# Pairwise distances among the lab names found only in the control cohort.
dists = pairwise_diff(ctrl_set - crrt_set)
dists

# Medications

In [None]:
# Read the medications table once per cohort directory (CRRT first, then control).
meds = "Medications.txt"
crrt_df, ctrl_df = (
    read_files_and_combine([meds], data_dir)
    for data_dir in (args.ucla_crrt_data_dir, args.ucla_control_data_dir)
)

# Record how many distinct patient IDs each cohort has in the medications table.
unique["meds"] = {
    cohort: frame["IP_PATIENT_ID"].nunique()
    for cohort, frame in (("crrt", crrt_df), ("ctrl", ctrl_df))
}
print(unique)

In [None]:
# PHARM_SUBCLASS column of the CRRT medications table.
crrt_df["PHARM_SUBCLASS"]

In [None]:
# Subclass-name column of the control medications table. The name is upper-case
# to agree with the set construction elsewhere in this section, which reads
# ctrl_df["MEDISPAN_SUBCLASS_NAME"]; the lower-case spelling raises KeyError
# against that same column.
ctrl_df["MEDISPAN_SUBCLASS_NAME"]

In [None]:
# Distinct subclass names per cohort; the column is named differently in each table.
crrt_set = set(crrt_df["PHARM_SUBCLASS"].unique())
ctrl_set = set(ctrl_df["MEDISPAN_SUBCLASS_NAME"].unique())

In [None]:
# Number of subclass names shared by both cohorts, then the number found in only one.
print(len(crrt_set & ctrl_set))
diff = crrt_set ^ ctrl_set
print(len(diff))

In [None]:
# First 30 rows (smallest distances) among subclass names found only in controls.
pairwise_diff(ctrl_set - crrt_set).iloc[:30]

In [None]:
# Pairwise distances among subclass names found only in the CRRT cohort.
pairwise_diff(crrt_set - ctrl_set)

# Problems

In [None]:
# Read the problem-list table once per cohort directory (CRRT first, then control).
probs = "Problem_Lists.txt"
crrt_df, ctrl_df = (
    read_files_and_combine([probs], data_dir)
    for data_dir in (args.ucla_crrt_data_dir, args.ucla_control_data_dir)
)

# Record how many distinct patient IDs each cohort has in the problem-list table.
unique["probs"] = {
    cohort: frame["IP_PATIENT_ID"].nunique()
    for cohort, frame in (("crrt", crrt_df), ("ctrl", ctrl_df))
}
print(unique)

In [None]:
# Column names of the CRRT problem-list table.
crrt_df.columns

# Procedures

In [None]:
# Read the procedures table once per cohort directory (CRRT first, then control).
procs = "Procedures.txt"
crrt_df, ctrl_df = (
    read_files_and_combine([procs], data_dir)
    for data_dir in (args.ucla_crrt_data_dir, args.ucla_control_data_dir)
)

# Record how many distinct patient IDs each cohort has in the procedures table.
unique["procs"] = {
    cohort: frame["IP_PATIENT_ID"].nunique()
    for cohort, frame in (("crrt", crrt_df), ("ctrl", ctrl_df))
}
print(unique)

## Time difference between different explode approaches

In [None]:
# The benchmark below works on the vitals columns, but in a top-to-bottom run
# `crrt_df` holds Procedures.txt at this point. Load the vitals table explicitly
# instead of depending on whichever frame was left in the kernel.
# NOTE(review): presumably Procedures.txt has no VITAL_SIGN_* columns — confirm.
vitals_crrt_df = read_files_and_combine(["Flowsheet_Vitals.txt"], args.ucla_crrt_data_dir)
explode_cols = ["VITAL_SIGN_VALUE", "VITAL_SIGN_TYPE"]

# Cap the sample at the frame size so small extracts do not raise, and seed it
# so the timing cells see the same rows on every run.
sample = vitals_crrt_df.sample(n=min(10000, len(vitals_crrt_df)), random_state=0)
# Rename blood-pressure labels so type and value both split into two parts on "/".
# Assigning the replaced column back avoids the chained `inplace=True` call,
# which does not modify `sample` when pandas Copy-on-Write is active.
sample = sample.assign(
    VITAL_SIGN_TYPE=sample["VITAL_SIGN_TYPE"].replace(
        {"BP": "SBP/DBP", "BLOOD PRESSURE": "SBP/DBP"}
    )
)

In [None]:
%%timeit -n 100

# Approach A: move every non-exploded column into the index, split and explode
# the remaining columns, then restore the columns and their original order.
(
    sample.set_index(list(sample.columns.difference(explode_cols)))
        .apply(lambda col: col.str.split("/").explode())
        .reset_index()
        .reindex(sample.columns, axis=1)
)

In [None]:
%%timeit  -n 100
# Approach B: split only the target columns on "/", then explode them together.
b = sample.apply(
    lambda col: col.str.split("/") if col.name in explode_cols else col
).explode(explode_cols)

In [None]:
# Approach A: explode through a temporary index made of the untouched columns.
a = (
    sample
    .set_index(list(sample.columns.difference(explode_cols)))
    .apply(lambda col: col.str.split("/").explode())
    .reset_index()
    .reindex(sample.columns, axis=1)
)
# Approach B: split the target columns in place and explode them together.
b = (
    sample
    .apply(lambda col: col.str.split("/") if col.name in explode_cols else col)
    .explode(explode_cols)
    .reset_index(drop=True)  # reset index to be fair comparison
)
# True when both approaches yield the same frame.
a.equals(b)