In [None]:
import os
import shlex

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Input inventory: 16 files in total (not including Bui_CRRT_files_email for syntax)
#   3 for static features
#   11 for encounters - longitudinal features
#      (9 are listed as active below; labs (1) and procedures (1) are commented out)
#   1 for mapping file
#   1 for outcome file

# Directory holding the raw exports. Override it per machine with the
# CRRT_DATA_DIR environment variable (e.g. "/home/davina/Private/dialysis-data")
# instead of editing this cell; the default is the original hard-coded location.
data_dir = os.environ.get(
    "CRRT_DATA_DIR",
    r"C:\Users\arvin\Documents\ucla research\CRRT project",
)

# One row (or a few rows) per patient; these are merged on IP_PATIENT_ID later.
static_features = [
    "Allergies_19-000093_10082020.txt",
    "Patient_Demographics_19-000093_10082020.txt",
    "Social_History_19-000093_10082020.txt",
]

# Longitudinal tables. Not loaded yet: see `files` below.
encounters = [
    "enc_19-000093_10082020.txt", # general visits to the doctors and general reasons
    "Encounter_Diagnoses_19-000093_10082020.txt",
    "Encounters_19-000093_10082020.txt",
    "Family_History_19-000093_10082020.txt",
    "Flowsheet_Vitals_19-000093_10082020.txt",
    "Hospital_Unit_Transfers_19-000093_10082020.txt",
#     "Labs_19-000093_10082020.txt",
    "Medications_19-000093_10082020.txt",
    "problem_list_diagnoses_19-000093_10082020.txt",
    "Problem_Lists_19-000093_10082020.txt",
#     "Procedures_19-000093_10082020.txt",
]
provider_mapping_file = "providers_19-000093_10082020.txt"
outcome_file = "CRRT Deidentified 2017-2019.csv"

# Only the static features are used for now; switch to the commented line to
# include the longitudinal tables as well.
# files = static_features + encounters
files = static_features

# Filled by the loading cell with one DataFrame per entry of `files`.
dfs = []

In [None]:
# Load every file in `files` into `dfs`. Files are assumed to be UTF-8; when
# decoding fails we try to detect the encoding and otherwise guess cp1252.
default_guess = "cp1252"

# Start from an empty list so re-running this cell does not append duplicates.
dfs.clear()

for file in files:
    # os.path.join picks the right separator on both Windows and Linux.
    path = os.path.join(data_dir, file)
    try:
        dfs.append(pd.read_csv(path))
    except UnicodeDecodeError:
        # Only decoding problems are handled here; a missing file or a parse
        # error should surface as-is instead of being reported as an encoding issue.
        print(f"Unexpected encoding in {file}")
        # get file encoding using file -i and extracting name with sed
        # ref: https://unix.stackexchange.com/a/393949
        # -n: don't print unless we say. s/ search, .* match any, charset=, // remove text up until after =, print remaining
        # The path is quoted because the data directory may contain spaces.
        command = f"file -i {shlex.quote(path)} | sed -n 's/.*charset=//p'"
        # strip() drops the trailing newline; the result is empty when the
        # `file`/`sed` tools are unavailable (e.g. on Windows).
        encoding = os.popen(command).read().strip()
        if encoding:
            print(f"Encoding was {encoding} instead of assumed utf-8.")
        if encoding in ("", "unknown-8bit"):
            print(f"Assuming {default_guess}...")
            encoding = default_guess
        dfs.append(pd.read_csv(path, encoding=encoding))

In [None]:
from functools import reduce


def _inner_join_on_patient(left, right):
    """Inner-join two frames on their shared IP_PATIENT_ID column."""
    return pd.merge(left, right, on="IP_PATIENT_ID", how="inner")


# Fold all loaded frames into one table, joining them left to right.
combined = reduce(_inner_join_on_patient, dfs)

# Preprocessing features

In [None]:
# map provider id to type
# Read the provider table and build an {IP_PROVIDER_ID: PROVIDER_TYPE} lookup.
provider_table = pd.read_csv(os.path.join(data_dir, provider_mapping_file))
provider_mapping = dict(zip(provider_table["IP_PROVIDER_ID"], provider_table["PROVIDER_TYPE"]))

# Replace each patient's PCP provider id by that provider's type and rename the
# column to match. The guard makes the cell safe to re-run: once the column has
# been renamed there is nothing left to convert.
if "PCP_IP_PROVIDER_ID" in combined.columns:
    combined["PCP_IP_PROVIDER_ID"] = combined["PCP_IP_PROVIDER_ID"].map(provider_mapping)
    combined = combined.rename(columns={"PCP_IP_PROVIDER_ID": "PCP_PROVIDER_TYPE"})

In [None]:
# Display the merged feature table as a visual sanity check.
combined

In [None]:
# Number of distinct patients in the merged table.
# NOTE(review): compare with len(combined) -- if a source file has several rows
# per patient, the join yields more rows than patients.
combined["IP_PATIENT_ID"].nunique()

# Load + Preprocess Outcomes

In [None]:
# Load the outcomes table from its CSV export.
# os.path.join keeps the path valid on both Windows and Linux.
outcomes = pd.read_csv(os.path.join(data_dir, outcome_file))

# Exclude pediatric data (currently disabled)
#exclude_peds = outcomes["Hospital name"] != "UCLA MEDICAL CENTER- PEDIATRICS"
#outcomes = outcomes[exclude_peds]

outcomes

# Validate Outcomes

In [None]:
# Outcome flag columns, grouped by whether the outcome counts as positive.
# Note: "Expired " carries a trailing space in the source column name.
positive_outcomes = ["Recov. renal funct.", "Transitioned to HD"]
negative_outcomes = ["Palliative Care", "Expired "]
outcome_cols = [*positive_outcomes, *negative_outcomes]

# Show just the four outcome flags.
outcomes.loc[:, outcome_cols]

In [None]:
# Each row should have exactly 1 1.0 value (one-hot of the 4 cols)
# Count how many outcome flags are set per row; a row is bad when that count is
# anything other than one, i.e. no outcome recorded OR several outcomes recorded.
flags_set_per_row = outcomes[outcome_cols].eq(1).sum(axis=1)
bad_rows = flags_set_per_row != 1
outcomes[bad_rows]
## TODO: Should i drop the bad row?

# Construct outcome feature (recommend dialysis)

In [None]:
# A patient is labelled 1 when either positive outcome flag equals 1, else 0.
recommend_dialysis = (
    outcomes[["Recov. renal funct.", "Transitioned to HD"]]
    .eq(1)
    .any(axis=1)
)
outcomes["recommend_dialysis"] = recommend_dialysis.astype(int)

# Patient id + label only, ready to be joined onto the features.
outcome_df = outcomes.loc[:, ["IP_PATIENT_ID", "recommend_dialysis"]]

In [None]:
# Percentage of outcome rows labelled 1 (class balance check).
n_recommended = sum(outcome_df["recommend_dialysis"])
n_recommended / len(outcome_df) * 100

# Merge features with outcome

In [None]:
# Attach the label to every feature row; patients without an outcome row (and
# outcome rows without features) are dropped by the inner join.
features_with_outcomes = combined.merge(outcome_df, how="inner", on="IP_PATIENT_ID")
features_with_outcomes

# Feature Engineering

## Fill in Missing Values

In [None]:
# Replacement value for missing entries, per column.
# TODO: mean should be done on the training set
fill_values = {
    'AGE': features_with_outcomes['AGE'].mean(),
    'TOBACCO_PAK_PER_DY': 0,
    'TOBACCO_USED_YEARS': 0,
    'TOBACCO_USER': 'Never',
    'ALCOHOL_USER': 'No',
    'ALCOHOL_OZ_PER_WK': 0,
    'ILLICIT_DRUG_FREQ': 0,
}

# Fill on the frame itself and rebind the result. Calling
# `df[col].fillna(..., inplace=True)` operates on a temporary column object,
# which recent pandas versions warn about and which no longer updates `df`
# once copy-on-write is enabled.
features_with_outcomes = features_with_outcomes.fillna(value=fill_values)

In [None]:
# convert ALCOHOL_OZ_PER_WK to numeric
def alc_freq_to_numeric(x):
    """Convert one ALCOHOL_OZ_PER_WK entry to a number of ounces per week.

    Accepted inputs:
      * a number (e.g. the ``0`` used to fill missing values), returned as float;
      * a numeric string such as ``"3.6"`` or ``".6"``;
      * a range string ``"low - high"`` such as ``"1.8 - 3"``, which is mapped
        to the midpoint of the range.

    Returns:
        float: the (midpoint) amount, rounded to 6 decimals to hide binary
        floating-point noise from the averaging.

    Raises:
        ValueError: if the entry cannot be parsed, or is negative, NaN or infinite.
    """
    # A range is written "low - high"; a single amount yields one piece.
    pieces = x.split("-") if isinstance(x, str) else [x]
    if len(pieces) > 2:
        raise ValueError("Invalid entry: {}".format(x))
    try:
        # float() tolerates the blanks around the range separator.
        amounts = [float(piece) for piece in pieces]
    except (TypeError, ValueError):
        raise ValueError("Invalid entry: {}".format(x)) from None
    # Comparisons with NaN are False, so this also rejects NaN (and inf / negatives).
    if not all(0 <= amount < float("inf") for amount in amounts):
        raise ValueError("Invalid entry: {}".format(x))
    return round(sum(amounts) / len(amounts), 6)
# Convert every ALCOHOL_OZ_PER_WK entry to a number, element by element.
alcohol_oz_raw = features_with_outcomes['ALCOHOL_OZ_PER_WK']
features_with_outcomes['ALCOHOL_OZ_PER_WK'] = alcohol_oz_raw.apply(alc_freq_to_numeric)

## One-Hot Encoding

In [None]:
# Columns treated as categorical and expanded into indicator columns.
categorical_columns = [
    'GENDER',
    'RACE',
    'ETHNICITY',
    'PCP_PROVIDER_TYPE',
    'TOBACCO_USER',
    'CIGARETTES_YN',
    'SMOKING_TOB_STATUS',
    'ALCOHOL_USER',
    'ALCOHOL_TYPE',
    'IV_DRUG_USER_YN',
    'ALLERGEN_ID',
]

enc = OneHotEncoder(handle_unknown='ignore')
cat_features = features_with_outcomes[categorical_columns].to_numpy()
# Fit on the full table, then densify the encoder output into a regular array.
OH_features = enc.fit_transform(cat_features).toarray()

In [None]:
#enc.categories_

In [None]:
# Row counts for the first three one-hot columns (quick sanity check).
tuple(sum(OH_features[:, col]) for col in range(3))

In [None]:
# (number of rows, number of one-hot columns) of the dense encoded matrix.
OH_features.shape

## Generate Feature Vector

In [None]:
# Columns passed to the models as-is, without one-hot encoding.
numeric_columns = [
    'AGE',
    'TOBACCO_PAK_PER_DY',
    'TOBACCO_USED_YEARS',
    'ALCOHOL_OZ_PER_WK',
    'ILLICIT_DRUG_FREQ',
]
real_features = features_with_outcomes.loc[:, numeric_columns].to_numpy()
real_features.shape

In [None]:
# Final design matrix: one-hot columns first, then the numeric columns.
feature_blocks = [OH_features, real_features]
features = np.concatenate(feature_blocks, axis=1)
features.shape

In [None]:
# Label vector, row-aligned with `features`.
targets = features_with_outcomes.loc[:, 'recommend_dialysis'].to_numpy()

# Classification Models (using CV)

In [None]:
# TODO: split by patient id

In [None]:
# 5-fold cross-validation splitter that keeps the class ratio similar in every fold.
skf = StratifiedKFold(n_splits=5)

In [None]:
# Cross-validate one classifier and report AUROC, accuracy and F1 per fold and
# on average.
#
# Folds are built per patient, not per row: every row of a given IP_PATIENT_ID
# lands in the same fold. Splitting by row would let rows of one patient appear
# in both the training and the test fold whenever the merged table holds several
# rows per patient, which inflates the scores.
groups = features_with_outcomes["IP_PATIENT_ID"].to_numpy()
patient_cv = StratifiedGroupKFold(n_splits=skf.get_n_splits())

aucrocs = []
accs = []
f1_scores = []
for train_index, test_index in patient_cv.split(features, targets, groups):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = targets[train_index], targets[test_index]

    # Uncomment exactly one classifier to evaluate it.
    #clf = LogisticRegression()
    #clf = SVC(probability=True)
    #clf = DecisionTreeClassifier()
    #clf = RandomForestClassifier()
    #clf = KNeighborsClassifier(3)
    #clf = MLPClassifier(alpha=1, max_iter=1000)
    #clf = GaussianNB()
    clf = MultinomialNB()
    #clf = AdaBoostClassifier()
    #clf = QuadraticDiscriminantAnalysis()
    #clf = GaussianProcessClassifier(1.0 * RBF(1.0))
    clf.fit(X_train, y_train)

    # Probability of the positive class (column 1 of predict_proba).
    pred_probs = clf.predict_proba(X_test)[:,1]
    # Hard labels obtained by rounding the probability; computed once and
    # shared by both threshold-based metrics.
    pred_labels = np.round(pred_probs)

    aucroc = roc_auc_score(y_test, pred_probs)
    acc = accuracy_score(y_test, pred_labels)
    f1_score_ = f1_score(y_test, pred_labels)
    print("aucroc: {}, acc: {}, f1_score: {}".format(aucroc, acc, f1_score_))
    aucrocs.append(aucroc)
    accs.append(acc)
    f1_scores.append(f1_score_)
print("mean -- aucroc: {}, acc: {}, f1_score: {}".format(np.mean(aucrocs), np.mean(accs), np.mean(f1_scores)))