# Outcomes

In [None]:
import pandas as pd

# NOTE(review): machine-specific absolute path; point it at your local copy of the data.
data_dir = "/home/davina/Private/dialysis-data"
outcome_file = "CRRT Deidentified 2017-2019.csv"

# Load the outcomes CSV.
outcomes = pd.read_csv(f"{data_dir}/{outcome_file}")

# Exclude pediatric data. `exclude_peds` is a boolean mask that is True for
# every row NOT from the pediatric hospital (rows with a missing hospital name
# compare unequal and therefore stay in the adult set).
exclude_peds = outcomes["Hospital name"] != "UCLA MEDICAL CENTER- PEDIATRICS"
adult_outcomes = outcomes[exclude_peds]

# unique() keeps NaN as a value, so missing IDs contribute one entry to this count.
print(f"N outcomes: {len(outcomes)}, N unique patients: {outcomes['IP_PATIENT_ID'].unique().shape[0]}")
# mean() of a boolean mask is a fraction in [0, 1]; scale by 100 to match the "%" label.
print(f"N missing patient ID: {outcomes['IP_PATIENT_ID'].isna().sum()} ({outcomes['IP_PATIENT_ID'].isna().mean()*100:0.2f}%)")

print(f"Number adults: {exclude_peds.sum()} ({exclude_peds.mean()*100:0.2f}%)")
print(f"Number peds: {(~exclude_peds).sum()} ({(~exclude_peds).mean()*100:0.2f}%)")

In [None]:
# Make this cell runnable on a fresh kernel: the loader is otherwise only
# imported in a later cell, so running the notebook top-to-bottom raised a
# NameError here.
import os
import sys

# Must happen before the import below so that `data.utils` can be resolved.
sys.path.insert(0, os.path.join(os.getcwd(), "../module_code"))
from data.utils import read_files_and_combine

df = read_files_and_combine(["Patient_Demographics_19-000093_10082020.txt"])

In [None]:
from IPython.display import display

# Restrict to rows that actually carry a patient identifier.
id_not_na = outcomes.loc[outcomes["IP_PATIENT_ID"].notna()]

# Number of outcome rows recorded for each patient ID.
num_observations_for_patient = id_not_na.groupby("IP_PATIENT_ID", dropna=False).size()
# Patients that appear on more than one row.
pts_with_multiple_obs = num_observations_for_patient.loc[num_observations_for_patient.gt(1)]
print(f"count: {len(pts_with_multiple_obs)}, min: {pts_with_multiple_obs.min()}, max: {pts_with_multiple_obs.max()}")

# All rows whose patient ID occurs more than once; keep=False marks every
# occurrence, not only the second and later ones.
duplicates = id_not_na.loc[id_not_na["IP_PATIENT_ID"].duplicated(keep=False)]
# Display only: the sorted view is not assigned, so `duplicates` keeps its row order.
duplicates.sort_values(by=["IP_PATIENT_ID", "End Date"])

In [None]:
def overlapping_treatments(df: pd.DataFrame) -> bool:
    """Report whether any of one patient's CRRT treatment intervals overlap.

    Each row is read as one treatment spanning
    ``[End Date - CRRT Total Days, End Date]``.

    Args:
        df: Rows for a single patient. Must contain an ``"End Date"`` column
            (anything ``pd.to_datetime`` can parse) and a numeric
            ``"CRRT Total Days"`` column. Row order does not matter.

    Returns:
        ``True`` if, in chronological order, some treatment ends after the
        next one starts; ``False`` otherwise. A treatment that ends exactly
        when the next one starts is not an overlap. Frames with fewer than
        two rows are never overlapping. In a multi-row frame, a row with a
        missing end date or duration cannot be placed on the timeline; the
        comparison against NaT is false, so the group is reported as
        overlapping.
    """
    end_date = pd.to_datetime(df["End Date"])
    # Vectorised day offsets; a missing duration becomes NaT instead of raising.
    start_date = end_date - pd.to_timedelta(df["CRRT Total Days"], unit="D")

    # The neighbour comparison below is only meaningful in chronological
    # order, and the incoming rows are not guaranteed to be sorted.
    intervals = pd.DataFrame({"start": start_date, "end": end_date}).sort_values(["start", "end"])

    next_start_date = intervals["start"].shift(-1)
    # The last row has no successor, so it is left out of the check.
    ends_before_next_starts = (intervals["end"] <= next_start_date).iloc[:-1]
    return not bool(ends_before_next_starts.all())

# Keep all rows of every patient for whom the predicate returns True;
# groups for which it returns False are dropped by filter().
overlapping = duplicates.groupby(by="IP_PATIENT_ID").filter(overlapping_treatments)
print(f"N patients with overlapping treatments: {len(overlapping.drop_duplicates('IP_PATIENT_ID'))}")
# Display only: show the flagged rows grouped by patient and ordered by end date.
overlapping.sort_values(by=["IP_PATIENT_ID", "End Date"])

# Data

In [None]:
import sys
import os
# Prepend ../module_code (resolved against the current working directory) to
# the import search path. This has to run before the `data.utils` import below.
# NOTE(review): presumably module_code holds the project's `data` package — confirm.
sys.path.insert(0, os.path.join(os.getcwd(), "../module_code"))

from data.utils import read_files_and_combine

# File names of the data tables to inspect. Every file shares the same
# "_19-000093_10082020.txt" suffix, so only the table stems are listed.
data_files = [
    f"{table}_19-000093_10082020.txt"
    for table in (
        "Allergies",
        "Patient_Demographics",
        "Social_History",
        "enc",
        "Encounter_Diagnoses",
        "Encounters",
        "Family_History",
        "Flowsheet_Vitals",
        "Hospital_Unit_Transfers",
        "Labs",
        "Medications",
        "problem_list_diagnoses",
        "Problem_Lists",
        "Procedures",
    )
]

In [None]:
# For each data table, report how many rows / distinct patient IDs it has and
# how many of them survive an inner join with the adult outcomes.
for file in data_files:
    df = read_files_and_combine([file])
    # NOTE(review): pandas matches null join keys to each other, so rows with a
    # missing IP_PATIENT_ID on both sides would pair up — confirm this is intended.
    merged = pd.merge(df, adult_outcomes, on="IP_PATIENT_ID", how="inner")

    # Label = file name without its last two underscore-separated fields (the
    # shared "19-000093_10082020.txt" suffix). Splitting on the first "_"
    # truncated multi-word names, e.g. "problem_list_diagnoses" became "problem".
    table_name = file.rsplit("_", 2)[0]

    # Compute each distinct-ID count once; these tables can be large.
    n_unique_ids = len(df.drop_duplicates("IP_PATIENT_ID"))
    n_unique_merged_ids = len(merged.drop_duplicates("IP_PATIENT_ID"))

    print("*" * 25 + f" {table_name} " + "*" * 25)
    print(f"N entries: {len(df)}, N unique IDs: {n_unique_ids}")
    print(f"MERGED WITH OUTCOME: N entries {len(merged)}, N unique IDs: {n_unique_merged_ids}")
    print(f"Difference in # unique patients: {n_unique_ids - n_unique_merged_ids}")