In [None]:
import pandas as pd

# Directory that holds every input file named below.
# NOTE(review): absolute, machine-specific path — confirm before running elsewhere.
data_dir = "/home/davina/Private/dialysis-data"
# Files merged per patient into `static_df` by read_files_and_combine.
static_features = [
    "Allergies_19-000093_10082020.txt",
    "Patient_Demographics_19-000093_10082020.txt",
    "Social_History_19-000093_10082020.txt",
]
# Remaining input files; the commented-out entries are excluded from this list.
# NOTE(review): no cell visible in this notebook reads `encounters` — confirm it is still needed.
encounters = [
    "enc_19-000093_10082020.txt",
    "Encounter_Diagnoses_19-000093_10082020.txt",
    "Encounters_19-000093_10082020.txt",
    "Family_History_19-000093_10082020.txt",
    "Flowsheet_Vitals_19-000093_10082020.txt",
    "Hospital_Unit_Transfers_19-000093_10082020.txt",
#     "Labs_19-000093_10082020.txt",
    "Medications_19-000093_10082020.txt",
    "problem_list_diagnoses_19-000093_10082020.txt",
    "Problem_Lists_19-000093_10082020.txt",
#     "Procedures_19-000093_10082020.txt",
]
# Table with IP_PROVIDER_ID and PROVIDER_TYPE columns, used to map provider ids to types.
provider_mapping_file = "providers_19-000093_10082020.txt"
# CSV holding the outcome columns.
outcome_file = "CRRT Deidentified 2017-2019.csv"

In [None]:
from functools import reduce
from typing import List

def read_files_and_combine(files: List[str], how="inner", base_dir=None):
    """Read each file with ``pd.read_csv`` and merge the results on ``IP_PATIENT_ID``.

    Files are first read with pandas' default (utf-8) decoding. If that fails
    with a ``UnicodeDecodeError``, the encoding is probed with the ``file``
    utility and the read is retried; when the probe is unavailable or
    inconclusive, cp1252 is assumed.

    Args:
        files: File names relative to ``base_dir``.
        how: Merge strategy forwarded to ``pd.merge``.
        base_dir: Directory containing the files. Defaults to the
            notebook-level ``data_dir`` when omitted.

    Returns:
        The frames merged left-to-right on ``IP_PATIENT_ID``. With a single
        file, that file's frame is returned unmerged.
    """
    # Local import, mirroring the original's function-scope import style.
    import subprocess

    root = data_dir if base_dir is None else base_dir
    default_guess = "cp1252"
    dfs = []

    for file in files:
        path = f"{root}/{file}"
        try:
            dfs.append(pd.read_csv(path))
            continue
        except UnicodeDecodeError:
            # Only decoding problems trigger the fallback; any other error
            # (missing file, malformed rows, ...) propagates to the caller.
            print(f"Unexpected encoding in {file}")

        try:
            # Argument list (no shell), so odd characters in the path cannot
            # be interpreted as shell syntax. Prints only the charset name.
            probe = subprocess.run(
                ["file", "-b", "--mime-encoding", path],
                capture_output=True,
                text=True,
                check=True,
            )
            encoding = probe.stdout.strip()
        except (OSError, subprocess.CalledProcessError):
            # `file` is not installed or could not inspect the path.
            encoding = "unknown-8bit"

        print(f"Encoding was {encoding} instead of assumed utf-8.")
        if encoding in ("", "unknown-8bit", "binary"):
            print(f"Assuming {default_guess}...")
            encoding = default_guess
        dfs.append(pd.read_csv(path, encoding=encoding))

    combined = reduce(lambda df1, df2: pd.merge(df1, df2, on="IP_PATIENT_ID", how=how), dfs)
    return combined

# Load + Preproc Outcomes

In [None]:
# Load the outcomes CSV
outcomes = pd.read_csv(f"{data_dir}/{outcome_file}")

# Exclude pediatric data
exclude_peds = outcomes["Hospital name"] != "UCLA MEDICAL CENTER- PEDIATRICS"
# .copy(): later cells assign new columns on `outcomes`; working on an
# independent frame (not a slice of the raw read) avoids SettingWithCopyWarning.
outcomes = outcomes[exclude_peds].copy()

# TODO: include CRRT Total Days as predictive feature

outcomes

## Validate Outcomes

In [None]:
# Parse the outcome end dates, then report the span they cover
outcomes["End Date"] = pd.to_datetime(outcomes["End Date"])
earliest = outcomes["End Date"].min()
latest = outcomes["End Date"].max()
f"min date: {earliest}. max date: {latest}"

In [None]:
# Outcome columns grouped by polarity; positive ones feed the `recommend_dialysis` label.
positive_outcomes = ["Recov. renal funct.", "Transitioned to HD"]
# NOTE(review): "Expired " carries a trailing space — presumably matches the CSV header exactly; confirm.
negative_outcomes = ["Palliative Care", "Expired "] 
outcome_cols = positive_outcomes + negative_outcomes
# Display only the four outcome columns.
outcomes[outcome_cols]

In [None]:
# Each row should have exactly 1 1.0 value (one-hot of the 4 cols)
# A row is "bad" when its outcome flags do NOT sum to exactly 1
# (no outcome recorded, or more than one outcome recorded).
bad_rows = outcomes[outcome_cols].fillna(0).sum(axis=1) != 1
outcomes[bad_rows]
## TODO: Should i drop the bad row?

## Construct outcome feature (recommend dialysis)

In [None]:
# Label is 1 when any positive-outcome column equals 1, otherwise 0
recommend_dialysis = outcomes[positive_outcomes].eq(1).any(axis=1)
outcomes["recommend_dialysis"] = recommend_dialysis.astype(int)

# Patient id + label only, to combine with features
outcome_df = outcomes.loc[:, ["IP_PATIENT_ID", "recommend_dialysis"]]

In [None]:
# Percentage of outcome rows labelled recommend_dialysis == 1
outcome_df["recommend_dialysis"].sum() / len(outcome_df) * 100

In [None]:
# Column-wise totals of each outcome flag and the label, as a percentage of rows
summary_cols = positive_outcomes + negative_outcomes + ["recommend_dialysis"]
outcomes[summary_cols].sum(axis=0) / len(outcome_df) * 100

In [None]:
import seaborn as sns
# Box plot of the "CRRT Total Days" column over all rows kept in `outcomes`.
sns.boxplot(outcomes["CRRT Total Days"])

In [None]:
# "CRRT Total Days" split by the recommend_dialysis label.
sns.boxplot(data=outcomes, y="CRRT Total Days", x="recommend_dialysis")
# Summary statistics for the same column (this is the cell's displayed value).
outcomes["CRRT Total Days"].describe()

In [None]:
# Number of distinct patient ids in the outcomes (a missing id counts as one value)
outcomes["IP_PATIENT_ID"].nunique(dropna=False)

# Preprocessing features

In [None]:
# Merge the static feature files on IP_PATIENT_ID (default inner join).
static_df = read_files_and_combine(static_features)

In [None]:
# Replace each patient's PCP provider id with that provider's type
provider_table = pd.read_csv(f"{data_dir}/{provider_mapping_file}")
provider_mapping = dict(zip(provider_table["IP_PROVIDER_ID"], provider_table["PROVIDER_TYPE"]))
static_df["PCP_IP_PROVIDER_ID"] = static_df["PCP_IP_PROVIDER_ID"].map(provider_mapping)
# the column now holds types rather than ids, so rename it accordingly
static_df = static_df.rename(columns={"PCP_IP_PROVIDER_ID": "PCP_PROVIDER_TYPE"})

In [None]:
# Display the static features after the provider-type mapping.
static_df

In [None]:
# Number of distinct patients present in the static features.
static_df["IP_PATIENT_ID"].nunique()

## LONGITUDINAL FEATURES

### Limit Timeframe

In [None]:
import numpy as np
from scipy.stats import skew
from datetime import timedelta

# Statistics computed per group when aggregating vitals and labs.
aggregate_functions = [min, max, np.mean, np.std, skew, len]
# NOTE(review): not referenced by any other cell visible in this notebook.
longitudinal_df = []

# Length of the look-back window applied by get_time_window_mask.
TIME_WINDOW = {
    "YEARS": 1,
    "MONTHS": 0
}

# TODO: patients that are in the other files but not in the outcome files will not have a corresponding entry/time.
def get_time_window_mask(df: pd.DataFrame, timestamp_feature_name: str) -> pd.DataFrame:
    """Keep the rows of ``df`` recorded within ``TIME_WINDOW`` before the patient's End Date.

    Despite the name, this returns the filtered frame, not a boolean mask.

    Each row is joined (left merge on ``IP_PATIENT_ID``) to the patient's
    ``End Date`` from ``outcomes``. A row is kept when its timestamp is on or
    after ``End Date`` minus the window. The window is measured in calendar
    years and months, so a one-year window reaches back a full year rather
    than 360 days.

    Rows whose patient has no ``End Date``, or whose timestamp is missing,
    compare as False and are dropped.

    NOTE(review): there is no upper bound, so rows dated after ``End Date``
    are kept — confirm this is intended.

    Args:
        df: Frame with an ``IP_PATIENT_ID`` column and the timestamp column.
        timestamp_feature_name: Name of the column holding the event time.

    Returns:
        The merged frame restricted to in-window rows, without ``End Date``.
    """
    merged_df = df.merge(outcomes[["IP_PATIENT_ID", "End Date"]], on="IP_PATIENT_ID", how="left")
    dates = pd.to_datetime(merged_df[timestamp_feature_name])
    # Parse here as well so the function does not depend on an earlier cell
    # having already converted the column.
    end_dates = pd.to_datetime(merged_df["End Date"])
    window = pd.DateOffset(years=TIME_WINDOW["YEARS"], months=TIME_WINDOW["MONTHS"])
    mask = dates >= (end_dates - window)
    # Count against the merged frame: the left merge can add rows when a
    # patient appears more than once in `outcomes`.
    print(f"Dropping {merged_df.shape[0] - mask.sum()} rows outside of time window.")
    return merged_df[mask].drop("End Date", axis=1)

### Diagnoses

In [None]:
dx_df = read_files_and_combine(["Encounter_Diagnoses_19-000093_10082020.txt"])
# Most frequent ICD-10 codes, limited to the top N
top_n = 15
icd10_dx = dx_df[dx_df["ICD_TYPE"] == 10]
icd10_dx.groupby("ICD_CODE").size().sort_values(ascending=False).head(top_n)

#### ICD9 VS ICD10

In [None]:
# Row counts per ICD revision, and how many ICD-10 rows there are per ICD-9 row.
icd10_n = (dx_df['ICD_TYPE'] == 10).sum()
icd9_n = (dx_df['ICD_TYPE'] == 9).sum()
f"ICD 10: {icd10_n}, ICD 9: {icd9_n}, ratio of 10:9: {icd10_n / icd9_n}"

In [None]:
# Earliest and latest diagnosis dates among the ICD9-coded rows
icd9_dx_dates = pd.to_datetime(dx_df.loc[dx_df["ICD_TYPE"] == 9, "DIAGNOSIS_DATE"])
first_icd9, last_icd9 = icd9_dx_dates.min(), icd9_dx_dates.max()
f"min date: {first_icd9}, max_date: {last_icd9}"

ICD-9 codes span 2013–2015 while outcomes are from 2018–2019, so we ignore all ICD-9 codes and use HCUP (via `hcuppy`) to map ICD-10 codes to CCS categories, which reduces the number of categories.

#### ICD10 to CCS

In [None]:
from hcuppy.ccs import CCSEngine
ce = CCSEngine(mode="dx")
# convert icd10 to ccs (rows with any other ICD_TYPE are left unmapped)
ccs_dict = dx_df.loc[dx_df["ICD_TYPE"] == 10, "ICD_CODE"].apply(lambda icd_code: ce.get_ccs(icd_code))
# series of dicts, explode each dict attribute to its own column.
# Keep the original row labels: concat(axis=1) aligns on the index, so a fresh
# 0..n-1 index would attach CCS codes to the wrong diagnosis rows whenever
# non-ICD10 rows are present.
ccs_dict = pd.DataFrame(ccs_dict.values.tolist(), index=ccs_dict.index)
ccs_dict.columns = ["CCS_CODE", "CCS_DESCRIPTION", "CCS_LEVEL1", "CCS_LEVEL1_DESCRIPTION", "CCS_LEVEL2", "CCS_LEVEL2_DESCRIPTION"]

# combine the granular icd codes with the higher level CCS ones
# (rows that were not mapped get NaN in the CCS columns)
dx_df = pd.concat([dx_df, ccs_dict], axis=1)

dx_df.groupby("CCS_CODE").size().sort_values(ascending=False)

In [None]:
# Restrict diagnoses to the look-back window, based on DIAGNOSIS_DATE.
dx_df = get_time_window_mask(dx_df, "DIAGNOSIS_DATE")
dx_df

In [None]:
# One indicator column per CCS code, one row per diagnosis row.
onehot_ccs = pd.get_dummies(dx_df[["IP_PATIENT_ID", "CCS_CODE"]], columns=["CCS_CODE"])
# Sum the columns within each patient group to get per-patient counts per CCS code.
onehot_ccs = onehot_ccs.groupby("IP_PATIENT_ID").apply(lambda df: df.sum(axis=0))

In [None]:
# Drop the IP_PATIENT_ID column, then turn the index back into columns.
# NOTE(review): not idempotent — re-running this cell alone changes the frame again.
onehot_ccs = onehot_ccs.drop("IP_PATIENT_ID", axis=1).reset_index()
onehot_ccs

### Vitals

These look a little strange right now.

In [None]:
# Load the flowsheet vitals file.
vitals_df = read_files_and_combine(["Flowsheet_Vitals_19-000093_10082020.txt"])

In [None]:
# Split BP into SBP and DBP
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: in-place edits through a selection do not reliably reach
# the parent frame (pandas copy-on-write).
vitals_df["VITAL_SIGN_TYPE"] = vitals_df["VITAL_SIGN_TYPE"].replace({"BP": "SBP/DBP"})
explode_cols = ["VITAL_SIGN_VALUE", "VITAL_SIGN_TYPE"]
def try_split_col(col: pd.Series):
    """Split a column on "/" and explode the pieces into separate rows.

    Only BP values and the renamed "SBP/DBP" type are expected to contain "/".
    A column that does not support the ``.str`` accessor is returned unchanged.
    """
    try:
        return col.str.split("/").explode()
    except AttributeError:
        # .str is unavailable on non-string columns; nothing to split.
        return col

# Ref: https://stackoverflow.com/a/57122617/1888794
# don't explode the columns you set index to, explode the rest via apply, reset everything to normal
vitals_df = (vitals_df.set_index(list(vitals_df.columns.difference(explode_cols)))
                    .apply(try_split_col)
                    .reset_index()
                    .reindex(vitals_df.columns, axis=1))

In [None]:
old_size = len(vitals_df)

# the same vital for the same patient at the same time is treated as a duplicate reading
dedupe_keys = ["IP_PATIENT_ID", "VITAL_SIGN_TYPE", "VITAL_SIGN_TAKEN_TIME"]
vitals_df = vitals_df.drop_duplicates(subset=dedupe_keys)
n_dropped = old_size - len(vitals_df)
f"Dropped {n_dropped} rows that were duplicates."

In [None]:
# Vitals listed here are not numeric, so they are removed before the float conversion
ignore_vitals = ["O2 Device"]
ignore_mask = ~vitals_df["VITAL_SIGN_TYPE"].isin(ignore_vitals)
vitals_df = vitals_df.loc[ignore_mask]

# keep only readings inside the look-back window
vitals_df = get_time_window_mask(vitals_df, "VITAL_SIGN_TAKEN_TIME")

# values must be floats before they can be aggregated
vitals_df = vitals_df.astype({"VITAL_SIGN_VALUE": float})
# one row per (patient, vital type), one column per aggregate statistic
vitals_by_patient = vitals_df.groupby(["IP_PATIENT_ID", "VITAL_SIGN_TYPE"])
vitals_df = vitals_by_patient.agg({"VITAL_SIGN_VALUE": aggregate_functions})

In [None]:
# Display the aggregated vitals, indexed by patient and vital type.
vitals_df

In [None]:
# Pivot vital types into columns, then flatten the multi-level column labels into single names
vitals_df = vitals_df.unstack()
vitals_df.columns = ["_".join(levels) for levels in vitals_df.columns]
vitals_df = vitals_df.reset_index()

### Medications

In [None]:
# Load the medications file.
rx_df = read_files_and_combine(["Medications_19-000093_10082020.txt"])
rx_df

In [None]:
# Row count per GENERIC_NAME, most frequent first.
rx_df.groupby("GENERIC_NAME").size().sort_values(ascending=False)

In [None]:
# Restrict medication rows to the look-back window, based on ORDER_DATE.
rx_df = get_time_window_mask(rx_df, "ORDER_DATE")

### Labs

In [None]:
# Load the labs file (it is commented out of the `encounters` list but read directly here).
labs_df = read_files_and_combine(["Labs_19-000093_10082020.txt"])

In [None]:
# Restrict lab rows to the look-back window, based on ORDER_TIME.
labs_df = get_time_window_mask(labs_df, "ORDER_TIME")

In [None]:
# Force numeric, ignore strings
# errors='coerce' turns any result that cannot be parsed as a number into NaN.
labs_df["RESULTS"] = pd.to_numeric(labs_df["RESULTS"], errors='coerce')

In [None]:
# Aggregate RESULTS per (patient, lab description) with every function in aggregate_functions.
labs_df = labs_df.groupby(["IP_PATIENT_ID", "DESCRIPTION"]).agg({"RESULTS": aggregate_functions})

In [None]:
# Pivot lab descriptions into columns, then flatten the multi-level column labels into single names
labs_df = labs_df.unstack()
labs_df.columns = ["_".join(levels) for levels in labs_df.columns]
labs_df = labs_df.reset_index()
labs_df

### Problems

In [None]:
# Load the two problem-list files separately; they are joined in the next cell.
problems_df = read_files_and_combine(["Problem_Lists_19-000093_10082020.txt"])
problem_list_df = read_files_and_combine(["problem_list_diagnoses_19-000093_10082020.txt"])

In [None]:
# Join the two problem tables on patient and encounter id (lower-case column names at this point).
problems_df = problem_list_df.merge(problems_df, on=["ip_patient_id", "ip_encounter_id"])

In [None]:
# Upper-case the column names, which get_time_window_mask needs in order to find IP_PATIENT_ID
problems_df.columns = problems_df.columns.str.upper()
problems_df = get_time_window_mask(problems_df, "NOTED_DATE")

In [None]:
# Row count per PROBLEM_STATUS value.
problems_df.groupby("PROBLEM_STATUS").size()

ICD-9 codes span 2013–2015 while outcomes are from 2018–2019, so we ignore all ICD-9 codes and use HCUP (via `hcuppy`) to map ICD-10 codes to CCS categories, which reduces the number of categories.

In [None]:
from hcuppy.ccs import CCSEngine
ce = CCSEngine(mode="dx")
# convert icd10 to ccs, for active ICD-10 problems only
active_icd10 = (problems_df["PROBLEM_STATUS"] == "Active") & (problems_df["ICD_TYPE"] == 10)
ccs_dict = problems_df.loc[active_icd10, "ICD_CODE"].apply(lambda icd_code: ce.get_ccs(icd_code))
# series of dicts, explode each dict attribute to its own column.
# Keep the original row labels: problems_df is already filtered (non-contiguous
# index) and only a subset of rows is mapped, so a fresh 0..n-1 index would make
# concat(axis=1) attach CCS codes to the wrong rows.
ccs_dict = pd.DataFrame(ccs_dict.values.tolist(), index=ccs_dict.index)
ccs_dict.columns = ["CCS_CODE", "CCS_DESCRIPTION", "CCS_LEVEL1", "CCS_LEVEL1_DESCRIPTION", "CCS_LEVEL2", "CCS_LEVEL2_DESCRIPTION"]

# combine the granular icd codes with the higher level CCS ones
# (rows that were not mapped get NaN in the CCS columns)
problems_df = pd.concat([problems_df, ccs_dict], axis=1)

problems_df.groupby("CCS_CODE").size().sort_values(ascending=False)

In [None]:
# One indicator column per CCS code, one row per problem row.
onehot_problems = pd.get_dummies(problems_df[["IP_PATIENT_ID", "CCS_CODE"]], columns=["CCS_CODE"])
# Sum the columns within each patient group to get per-patient counts per CCS code.
onehot_problems = onehot_problems.groupby("IP_PATIENT_ID").apply(lambda df: df.sum(axis=0))

In [None]:
# Drop the IP_PATIENT_ID column, then turn the index back into columns.
# NOTE(review): not idempotent — re-running this cell alone changes the frame again.
onehot_problems = onehot_problems.drop("IP_PATIENT_ID", axis=1).reset_index()
onehot_problems

### Procedures

In [None]:
# Load the procedures file (it is commented out of the `encounters` list but read directly here).
procedure_df = read_files_and_combine(["Procedures_19-000093_10082020.txt"])

In [None]:
# Row count per PROC_CODE, most frequent first.
grouped_proc = procedure_df.groupby("PROC_CODE").size().sort_values(ascending=False)
grouped_proc

In [None]:
# Number of procedure codes that occur exactly once
(grouped_proc == 1).sum()

In [None]:
from hcuppy.cpt import CPT
cpt = CPT()

# Look up the CPT section record for every procedure code, in row order
cpt_dict = [cpt.get_cpt_section(cpt_code) for cpt_code in procedure_df["PROC_CODE"]]
# one record per row; spread each record's fields into their own columns
cpt_dict = pd.DataFrame(cpt_dict)
cpt_dict.columns = ["CPT_SECTION", "SECTION_DESCRIPTION"]

# attach the higher-level hcuppy sections next to the granular procedure codes
procedure_df = pd.concat([procedure_df, cpt_dict], axis=1)

In [None]:
# Row count per CPT_SECTION, most frequent first.
procedure_df.groupby("CPT_SECTION").size().sort_values(ascending=False)

In [None]:
# Restrict procedure rows to the look-back window, based on PROC_DATE.
procedure_df = get_time_window_mask(procedure_df, "PROC_DATE")

In [None]:
# One indicator column per CPT section, one row per procedure row.
onehot_proc = pd.get_dummies(procedure_df[["IP_PATIENT_ID", "CPT_SECTION"]], columns=["CPT_SECTION"])
# Sum the columns within each patient group to get per-patient counts per CPT section.
onehot_proc = onehot_proc.groupby("IP_PATIENT_ID").apply(lambda df: df.sum(axis=0))

In [None]:
# Drop the IP_PATIENT_ID column, then turn the index back into columns.
# NOTE(review): not idempotent — re-running this cell alone changes the frame again.
onehot_proc = onehot_proc.drop("IP_PATIENT_ID", axis=1).reset_index()
onehot_proc

# Merge features with outcome

In [None]:
# Merge the per-patient feature tables with the static features and the label.
# The aggregated one-hot tables are used for problems and procedures
# (`procedures_df` was never defined; the row-level `problems_df` would
# duplicate patients).
id_col = "IP_PATIENT_ID"
# Diagnoses and problems both produce CCS_CODE_* columns; prefix them so the
# merged columns stay distinguishable instead of becoming *_x / *_y.
dx_features = onehot_ccs.rename(columns=lambda col: col if col == id_col else f"DX_{col}")
problem_features = onehot_problems.rename(columns=lambda col: col if col == id_col else f"PROBLEM_{col}")
# NOTE(review): rx_df is still one row per medication order (no per-patient
# aggregation is visible in this notebook), so merging it repeats each patient
# once per order — confirm whether it should be aggregated first.
longitudinal_dfs = [dx_features, vitals_df, rx_df, labs_df, problem_features, onehot_proc]
features_with_outcomes = reduce(lambda df1, df2: pd.merge(df1, df2, on=id_col, how="inner"),
                                longitudinal_dfs + [static_df, outcome_df])
features_with_outcomes

In [None]:
# NOTE(review): `dx_vitals` is not assigned in any cell visible in this notebook; on a fresh
# kernel this raises NameError unless it is defined elsewhere — confirm or remove this cell.
dx_vitals