# Outcomes

In [None]:
import pandas as pd
from os.path import join
import os

# Directory holding the CRRT extracts. The original location is kept as the
# default; set the CRRT_DATA_DIR environment variable to run the notebook
# against another directory without editing this cell.
data_dir = os.environ.get("CRRT_DATA_DIR", "/home/davina/Private/crrt-data")
outcome_file = "CRRT Deidentified 2015-2021YTD_VF.xlsx"
id_mapping_file = "Patient_Identifiers.txt"

# Passing a list of sheet names makes read_excel return a dict keyed by sheet
# name, so exactly these two sheets are loaded (not the workbook's first sheet).
outcomes = pd.read_excel(join(data_dir, outcome_file), sheet_name=["Ped CRRT List", "2015-2021 YTD"])
peds_df = outcomes["Ped CRRT List"]
adult_df = outcomes["2015-2021 YTD"]

In [None]:
# Row filter: True for pediatric rows with fewer than half of their fields missing.
not_missing_half_values = peds_df.isna().mean(axis=1).lt(0.5)
# Columns removed from the pediatric sheet.
drop_columns = ["PAT_NAME", "DOSAGE", "HOSP_ADMSN_DATE", "HOSP_DISCH_DATE", "Unnamed: 5", "Unnamed: 14"]
# Apply the row filter, drop the columns, and flag every remaining row as pediatric.
processed_peds = (
    peds_df.loc[not_missing_half_values]
    .drop(columns=drop_columns)
    .assign(Pediatric=1)
)
# Name of the MRN column as it appears in the pediatric sheet.
mrn_col_name = "PAT_MRN_ID"

In [None]:
# "Hospital name" is deliberately kept (only "Month" and "Unnamed: 11" are
# dropped) because later cells report on it.
# NOTE(review): fillna(0) is applied to every column of the adult sheet, not
# only numeric ones — confirm 0 is a sensible fill for date/text columns.
processed_adult = (
    adult_df.fillna(0)
    .drop(columns=["Month", "Unnamed: 11"])
    # Use the same MRN column name as the pediatric sheet.
    .rename(columns={"Medical record number": mrn_col_name})
    # Flag every row from this sheet as non-pediatric.
    .assign(Pediatric=0)
)

In [None]:
# Stack the pediatric and adult outcome rows into a single table.
adult_and_peds_outcomes = pd.concat([processed_peds, processed_adult])
# Calendar year taken from each row's "End Date".
end_date_index = pd.DatetimeIndex(adult_and_peds_outcomes["End Date"])
adult_and_peds_outcomes["CRRT Year"] = end_date_index.year

# Attach the patient identifier file with a left join so every outcome row is kept.
# NOTE(review): the original comment described this as a join on MRN, but the
# key used on both sides is "IP_PATIENT_ID" — confirm which column is intended.
id_mapping_df = pd.read_csv(join(data_dir, id_mapping_file))
adult_and_peds_outcomes = adult_and_peds_outcomes.merge(
    id_mapping_df,
    how="left",
    left_on="IP_PATIENT_ID",
    right_on="IP_PATIENT_ID",
)

In [None]:
from datetime import timedelta
#### Construct Start Date ####  -- For convenience of time-windows --
# Make sure the end date is a datetime column before doing date arithmetic.
adult_and_peds_outcomes["End Date"] = pd.to_datetime(adult_and_peds_outcomes["End Date"])

# CRRT Start Date = End Date - (Days on CRRT - 1)
# e.g. finish on the 10th and 3 days of CRRT: 8th (1), 9th (2), 10th (3)
# pd.to_timedelta is vectorised and turns a missing day count into NaT (so the
# start date becomes NaT), whereas building datetime.timedelta objects row by
# row raises ValueError on NaN.
offset = pd.to_timedelta(adult_and_peds_outcomes["CRRT Total Days"] - 1, unit="D")
adult_and_peds_outcomes["Start Date"] = adult_and_peds_outcomes["End Date"] - offset

In [None]:
# Age in years at the start of CRRT, derived from DOB.
# 365.25 is the mean year length once leap days are counted; dividing by a flat
# 365 overstates age by several days around 18 years, which can push a patient
# just under 18 over the cut-off used below. This is still an approximation.
DAYS_PER_YEAR = 365.25
days_from_birth_to_start = (
    pd.DatetimeIndex(adult_and_peds_outcomes["Start Date"])
    - pd.DatetimeIndex(adult_and_peds_outcomes["DOB"])
).days
adult_and_peds_outcomes["Age at Start of CRRT"] = days_from_birth_to_start / DAYS_PER_YEAR

# Rows whose derived age is under 18 but which did not come from the pediatric sheet.
is_minor = adult_and_peds_outcomes["Age at Start of CRRT"] < 18
not_peds = adult_and_peds_outcomes["Pediatric"] == 0
mask = is_minor & not_peds

In [None]:
import matplotlib.pyplot as plt
# Compare the AGE column of the demographics extract with the age derived from DOB.
demographics_path = join(data_dir, "Patient_Demographics.txt")
age_from_pt_demographics = pd.read_csv(demographics_path)[["IP_PATIENT_ID", "AGE"]]
age_from_dob = adult_and_peds_outcomes[["IP_PATIENT_ID", "Age at Start of CRRT"]]
# Inner join: only patient IDs present in both tables are compared.
age_values = pd.merge(age_from_pt_demographics, age_from_dob, on="IP_PATIENT_ID", how="inner")
age_diff = age_values["AGE"].sub(age_values["Age at Start of CRRT"])
plt.hist(age_diff)

In [None]:
# Number of pediatric MRNs that also appear in the adult sheet
# (a result of 0 means the two sheets do not share patients).
peds_mrn_in_adult_sheet = processed_peds["PAT_MRN_ID"].isin(processed_adult["PAT_MRN_ID"])
peds_mrn_in_adult_sheet.sum()

In [None]:
# Show the date/age columns for rows from the pediatric sheet (that sheet has no DOB).
from_peds_sheet = adult_and_peds_outcomes["Pediatric"] == 1
adult_and_peds_outcomes.loc[from_peds_sheet, ["DOB", "Start Date", "Age at Start of CRRT"]]

In [None]:
# adult_and_peds_outcomes[is_minor | (adult_and_peds_outcomes['Hospital name'] == 'UCLA MEDICAL CENTER- PEDIATRICS')].to_csv("pediatrics_in_adult_sheet_outcomes.csv")

In [None]:
# Show the date/age columns for every row whose derived age at CRRT start is under 18.
adult_and_peds_outcomes.loc[is_minor, ["DOB", "Start Date", "Age at Start of CRRT"]]

In [None]:
# Report on minors found outside the pediatric sheet and on use of the pediatric hospital label.
# The hospital comparison is computed once on the full frame and combined with
# `mask` using `&`. Filtering with `[mask]` first and then indexing with a
# full-length boolean Series makes pandas reindex the key and emit a
# UserWarning; the printed values are the same either way.
is_peds_hospital = adult_and_peds_outcomes['Hospital name'] == 'UCLA MEDICAL CENTER- PEDIATRICS'
print(f"There are {mask.sum()} patients who have a DOB that indicates they are less than 18 years of age at the time of CRRT Start Date but are not in the pediatric data sheet.")
print(f"{(mask & is_peds_hospital).sum()} of those patients indicate their hospital is pediatrics")
print(f"The remaining minors in the adult sheet indicate the hospital as {adult_and_peds_outcomes.loc[mask & ~is_peds_hospital, 'Hospital name'].values}")
print(f"There are a total of {is_peds_hospital.sum()} patients in the adult sheet with peds hospital listed.")

In [None]:

# Drop the MRN columns and the date of birth from the combined table.
adult_and_peds_outcomes = adult_and_peds_outcomes.drop(columns=[mrn_col_name, "MRN", "DOB"])

In [None]:
# True when no pediatric row has an IP_PATIENT_ID, i.e. nothing was mapped for the pediatric sheet.
peds_rows = adult_and_peds_outcomes["Pediatric"] == 1
adult_and_peds_outcomes.loc[peds_rows, "IP_PATIENT_ID"].isna().all()

In [None]:
# Alternative ID assignment: translate each MRN through a lookup instead of merging.
# NOTE(review): this rebuilds adult_and_peds_outcomes from the two processed
# sheets, so columns added to it in earlier cells (e.g. "Start Date") are not
# carried over — confirm that is intended.
adult_and_peds_outcomes = pd.concat([processed_peds, processed_adult])
adult_and_peds_outcomes["CRRT Year"] =  pd.DatetimeIndex(adult_and_peds_outcomes["End Date"]).year
# `mapping` was not defined anywhere in this notebook, so the cell failed with
# NameError on a fresh kernel. Build the MRN -> deidentified ID lookup here.
# assumes id_mapping_df has "MRN" and "IP_PATIENT_ID" columns — TODO confirm
mapping = dict(zip(id_mapping_df["MRN"], id_mapping_df["IP_PATIENT_ID"]))
# map values (MRNs without an entry in the lookup become NaN)
adult_and_peds_outcomes[mrn_col_name] = adult_and_peds_outcomes[mrn_col_name].map(mapping)
# Rename column from MRN to deidentified patient ID
adult_and_peds_outcomes = adult_and_peds_outcomes.rename(columns={mrn_col_name: "IP_PATIENT_ID"})

# True when no pediatric row received an ID from the lookup.
adult_and_peds_outcomes[adult_and_peds_outcomes["Pediatric"] == 1]["IP_PATIENT_ID"].isna().all()

In [None]:

# Summary counts for the combined outcomes table.
# `outcomes` is the dict of raw sheets returned by read_excel, so indexing it by
# column name raises KeyError; the statistics are taken from the combined frame.
# NOTE(review): assumes adult_and_peds_outcomes has "IP_PATIENT_ID" and
# "Pediatric" columns at this point — confirm against the preceding cells.
patient_ids = adult_and_peds_outcomes["IP_PATIENT_ID"]
# `exclude_peds` was not defined anywhere in this notebook; True for rows from the adult sheet.
exclude_peds = adult_and_peds_outcomes["Pediatric"] == 0

print(f"N outcomes: {len(adult_and_peds_outcomes)}, N unique patients: {patient_ids.unique().shape[0]}")
# .mean() is a fraction, so scale by 100 before printing it with a % sign.
print(f"N missing patient ID: {patient_ids.isna().sum()} ({patient_ids.isna().mean()*100:0.2f}%)")

print(f"Number adults: {exclude_peds.sum()} ({exclude_peds.mean()*100:0.2f}%)")
print(f"Number peds: {(~exclude_peds).sum()} ({(~exclude_peds).mean()*100:0.2f}%)")

In [None]:
# read_files_and_combine is only imported in a later cell of this notebook, so
# this cell raised NameError when run top to bottom. Import it here as well.
import os
import sys

_module_code_dir = os.path.join(os.getcwd(), "../module_code")
if _module_code_dir not in sys.path:
    # Same search-path entry the later import cell adds; skipped if already present.
    sys.path.insert(0, _module_code_dir)

from data.utils import read_files_and_combine

df =  read_files_and_combine(["Patient_Demographics_19-000093_10082020.txt"])

In [None]:
from IPython.display import display

# Patients with more than one outcome row.
# `outcomes` is the dict of raw sheets returned by read_excel, so indexing it by
# column name raises KeyError; the combined frame is used instead.
# NOTE(review): assumes adult_and_peds_outcomes has an "IP_PATIENT_ID" column
# at this point — confirm against the preceding cells.
id_not_na = adult_and_peds_outcomes[adult_and_peds_outcomes["IP_PATIENT_ID"].notna()]

# Number of outcome rows per patient ID, then only the IDs that occur more than once.
num_observations_for_patient = id_not_na.groupby("IP_PATIENT_ID", dropna=False).size()
pts_with_multiple_obs = num_observations_for_patient[num_observations_for_patient > 1]
print(f"count: {len(pts_with_multiple_obs)}, min: {pts_with_multiple_obs.min()}, max: {pts_with_multiple_obs.max()}")

# keep=False marks every row of a repeated ID, not just the later occurrences.
duplicates = id_not_na[id_not_na["IP_PATIENT_ID"].duplicated(keep=False)]
# Displayed sorted for reading only; `duplicates` itself keeps its original row order.
duplicates.sort_values(["IP_PATIENT_ID", "End Date"])

In [None]:
def overlapping_treatments(df: pd.DataFrame) -> bool:
    """Return True if any two treatments in ``df`` overlap in time.

    Args:
        df: Rows for one patient. Must contain the columns "End Date"
            (anything ``pd.to_datetime`` accepts) and "CRRT Total Days"
            (a number of days).

    Returns:
        True when at least one treatment ends after the next one starts,
        otherwise False. Zero or one row always gives False.

    Each treatment's start is taken as ``End Date - CRRT Total Days``, and a
    treatment may end exactly on the next one's start without counting as an
    overlap. Rows are ordered by end date before neighbours are compared, so
    the result does not depend on the incoming row order. If a treatment's end
    date or the next treatment's day count is missing, the comparison is False
    and the pair is reported as overlapping.
    """
    # Sort chronologically and use a fresh positional index so shift(-1)
    # always refers to the next treatment in time.
    ordered = (
        df.assign(**{"End Date": pd.to_datetime(df["End Date"])})
        .sort_values("End Date", kind="stable")
        .reset_index(drop=True)
    )
    end_dates = ordered["End Date"]
    # Vectorised; a missing day count becomes NaT instead of raising.
    start_dates = end_dates - pd.to_timedelta(ordered["CRRT Total Days"], unit="D")
    next_start_dates = start_dates.shift(-1)
    # The last row has no successor, so it is excluded from the check.
    ends_before_next_starts = (end_dates <= next_start_dates).iloc[:-1]
    return not bool(ends_before_next_starts.all())

# Keep the rows of every patient for whom overlapping_treatments returns True.
overlapping = duplicates.groupby("IP_PATIENT_ID").filter(overlapping_treatments)
# One row per patient ID gives the number of affected patients.
n_overlapping_patients = len(overlapping.drop_duplicates('IP_PATIENT_ID'))
print(f"N patients with overlapping treatments: {n_overlapping_patients}")
overlapping.sort_values(by=["IP_PATIENT_ID", "End Date"])

# Data

In [None]:
import sys
import os
sys.path.insert(0, os.path.join(os.getcwd(), "../module_code"))

from data.utils import read_files_and_combine

# Every extract file name ends with the same suffix; only the leading table name differs.
_EXTRACT_SUFFIX = "_19-000093_10082020.txt"
data_files = [
    f"{table_name}{_EXTRACT_SUFFIX}"
    for table_name in (
        "Allergies",
        "Patient_Demographics",
        "Social_History",
        "enc",
        "Encounter_Diagnoses",
        "Encounters",
        "Family_History",
        "Flowsheet_Vitals",
        "Hospital_Unit_Transfers",
        "Labs",
        "Medications",
        "problem_list_diagnoses",
        "Problem_Lists",
        "Procedures",
    )
]

In [None]:
# For every extract, report how many of its patients also have an adult CRRT outcome.
# `adult_outcomes` was not defined anywhere in this notebook (NameError on a
# fresh kernel); derive it here as the outcome rows from the adult sheet.
# NOTE(review): assumes adult_and_peds_outcomes has "Pediatric" and
# "IP_PATIENT_ID" columns at this point — confirm against the preceding cells.
adult_outcomes = adult_and_peds_outcomes[adult_and_peds_outcomes["Pediatric"] == 0]

for file in data_files:
    df = read_files_and_combine([file])
    merged = pd.merge(df, adult_outcomes, on="IP_PATIENT_ID", how="inner")
    # Drop the last two "_"-separated parts of the file name so the header shows
    # the full table name; splitting on the first "_" collapsed e.g.
    # "Patient_Demographics" to "Patient" and made several headers ambiguous.
    table_name = file.rsplit("_", 2)[0]
    # Count unique IDs once per frame instead of recomputing them for each print.
    n_unique_ids = len(df.drop_duplicates('IP_PATIENT_ID'))
    n_unique_merged_ids = len(merged.drop_duplicates('IP_PATIENT_ID'))
    print("*"*25 +  f" {table_name} " + "*" * 25)
    print(f"N entries: {len(df)}, N unique IDs: {n_unique_ids}")
    print(f"MERGED WITH OUTCOME: N entries {len(merged)}, N unique IDs: {n_unique_merged_ids}")
    print(f"Difference in # unique patients: {n_unique_ids - n_unique_merged_ids}")