In [None]:
import os
import subprocess

import pandas as pd

data_dir = "/home/davina/Private/dialysis-data"
static_features = [
    "Allergies_19-000093_10082020.txt",
    "Patient_Demographics_19-000093_10082020.txt",
    "Social_History_19-000093_10082020.txt",
]
encounters = [
    "enc_19-000093_10082020.txt",
    "Encounter_Diagnoses_19-000093_10082020.txt",
    "Encounters_19-000093_10082020.txt",
    "Family_History_19-000093_10082020.txt",
    "Flowsheet_Vitals_19-000093_10082020.txt",
    "Hospital_Unit_Transfers_19-000093_10082020.txt",
#     "Labs_19-000093_10082020.txt",
    "Medications_19-000093_10082020.txt",
    "problem_list_diagnoses_19-000093_10082020.txt",
    "Problem_Lists_19-000093_10082020.txt",
#     "Procedures_19-000093_10082020.txt",
]
provider_mapping_file = "providers_19-000093_10082020.txt"
outcome_file = "CRRT Deidentified 2017-2019.csv"
# files = static_features + encounters
files = static_features

dfs = []

In [None]:
def _detect_encoding(path):
    """Return the charset that the `file` utility reports for `path`.

    Returns an empty string when `file` is not installed or exits with an
    error, so the caller can fall back to a default guess.
    """
    try:
        # Argument list instead of a shell string: no quoting problems with
        # spaces or shell metacharacters in the path. `-b --mime-encoding`
        # prints only the charset name, so no `sed` post-processing is needed.
        reported = subprocess.check_output(
            ["file", "-b", "--mime-encoding", "--", path],
            universal_newlines=True,
            stderr=subprocess.DEVNULL,
        )
    except (OSError, subprocess.CalledProcessError):
        return ""
    return reported.strip()


# Start from an empty list so re-running this cell does not append every
# frame a second time.
dfs = []
default_guess = "cp1252"
for file in files:
    path = f"{data_dir}/{file}"
    try:
        frame = pd.read_csv(path)
    except UnicodeDecodeError:
        # Only a decoding failure triggers the fallback; other errors (missing
        # file, parse errors, interrupts) propagate unchanged.
        print(f"Unexpected encoding in {file}")
        encoding = _detect_encoding(path)
        print(f"Encoding was {encoding or 'unknown'} instead of assumed utf-8.")
        if encoding in ("", "unknown-8bit"):
            # `file` could not name the charset: use the default guess.
            print(f"Assuming {default_guess}...")
            encoding = default_guess
        frame = pd.read_csv(path, encoding=encoding)
    dfs.append(frame)

In [None]:
from functools import reduce
# Fold the loaded frames into a single frame by inner-joining them, left to
# right, on the patient identifier column.
# NOTE(review): if any input has several rows per patient, each join
# multiplies rows for that patient - confirm this is intended.
combined = reduce(
    lambda left, right: left.merge(right, on="IP_PATIENT_ID", how="inner"),
    dfs,
)

# Preprocessing features

In [None]:
# Map provider id to provider type.
provider_table = pd.read_csv(f"{data_dir}/{provider_mapping_file}")
provider_mapping = dict(zip(provider_table["IP_PROVIDER_ID"], provider_table["PROVIDER_TYPE"]))

# Replace the PCP provider id with its provider type and rename the column to
# match. The guard makes a second run of this cell a no-op instead of a
# KeyError on the already-renamed column; a missing source column on the first
# run still raises. A new frame is built rather than mutating in place.
if "PCP_PROVIDER_TYPE" not in combined.columns:
    combined = combined.assign(
        PCP_IP_PROVIDER_ID=combined["PCP_IP_PROVIDER_ID"].map(provider_mapping)
    ).rename(columns={"PCP_IP_PROVIDER_ID": "PCP_PROVIDER_TYPE"})

In [None]:
# Show the merged feature frame for a visual check.
combined

In [None]:
# Number of distinct patient ids in the merged features (missing ids are not counted).
combined["IP_PATIENT_ID"].nunique(dropna=True)

# Load and preprocess outcomes

In [None]:
# Load the outcomes table from its CSV export.
outcomes = pd.read_csv(f"{data_dir}/{outcome_file}")

# Exclude pediatric data. The filtered result is copied so that columns added
# to `outcomes` later are written to an independent frame and do not trigger
# pandas' SettingWithCopyWarning.
exclude_peds = outcomes["Hospital name"] != "UCLA MEDICAL CENTER- PEDIATRICS"
outcomes = outcomes.loc[exclude_peds].copy()

outcomes

# Validate Outcomes

In [None]:
# Outcome indicator columns, grouped into positive and negative outcomes.
# The trailing space in "Expired " is part of the column name used here.
positive_outcomes = ["Recov. renal funct.", "Transitioned to HD"]
negative_outcomes = ["Palliative Care", "Expired "]
outcome_cols = [*positive_outcomes, *negative_outcomes]

# Show only the outcome indicator columns.
outcomes.loc[:, outcome_cols]

In [None]:
# Each row should have exactly one 1.0 value among the four outcome columns
# (one-hot). Count the flagged cells per row and report every row whose count
# is not 1, so rows with several outcomes are caught as well as rows with none.
# Missing values compare unequal to 1 and therefore count as "not flagged".
bad_rows = outcomes[outcome_cols].eq(1).sum(axis=1) != 1
outcomes[bad_rows]
# TODO: decide whether the bad rows should be dropped.

# Construct outcome feature (recommend dialysis)

In [None]:
# A row is labelled 1 when any of the positive outcome columns equals 1,
# otherwise 0 (missing values count as "not set").
recommend_dialysis = outcomes[positive_outcomes].eq(1).any(axis=1)

# assign() returns a new frame instead of writing a column into a frame that
# was produced by boolean filtering, which avoids SettingWithCopyWarning and
# keeps this cell safe to re-run.
outcomes = outcomes.assign(recommend_dialysis=recommend_dialysis.astype(int))

# To combine with features
outcome_df = outcomes[["IP_PATIENT_ID", "recommend_dialysis"]]

In [None]:
# Percentage of rows labelled recommend_dialysis == 1. The vectorised mean
# replaces a Python-level sum over the Series and gives NaN rather than a
# ZeroDivisionError when the frame is empty.
outcome_df["recommend_dialysis"].mean() * 100

# Merge features with outcome

In [None]:
# Attach the outcome label to the features; the inner join keeps only patient
# ids present in both frames.
features_with_outcomes = combined.merge(outcome_df, on="IP_PATIENT_ID", how="inner")
features_with_outcomes