In [54]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np

In [2]:
# I currently have 16 files (not including Bui_CRRT_files_email for syntax)

# 16 files in total
# 3 for static features
# 9 for encounters (minus labs (1) and procedures (1)) - longitudinal features
# 1 for mapping file
# 1 for outcome file

#data_dir = "/home/davina/Private/dialysis-data"
data_dir = r"C:\Users\arvin\Documents\ucla research\CRRT project"
static_features = [
    "Allergies_19-000093_10082020.txt",
    "Patient_Demographics_19-000093_10082020.txt",
    "Social_History_19-000093_10082020.txt",
]
encounters = [
    "enc_19-000093_10082020.txt", # general visits to the doctors and general reasons
    "Encounter_Diagnoses_19-000093_10082020.txt",
    "Encounters_19-000093_10082020.txt",
    "Family_History_19-000093_10082020.txt",
    "Flowsheet_Vitals_19-000093_10082020.txt",
    "Hospital_Unit_Transfers_19-000093_10082020.txt",
#     "Labs_19-000093_10082020.txt",
    "Medications_19-000093_10082020.txt",
    "problem_list_diagnoses_19-000093_10082020.txt",
    "Problem_Lists_19-000093_10082020.txt",
#     "Procedures_19-000093_10082020.txt",
]
provider_mapping_file = "providers_19-000093_10082020.txt"
outcome_file = "CRRT Deidentified 2017-2019.csv"
# files = static_features + encounters
files = static_features

dfs = []

In [3]:
for file in files:
    try:
        dfs.append(pd.read_csv(f"{data_dir}\\{file}"))
    except:
        print(f"Unexpected encoding in {file}")
        default_guess = "cp1252"
        import os
        # get file encoding using file -i and extracting name with sed
        # ref: https://unix.stackexchange.com/a/393949
        # -n: don't print unless we say. s/ search, .* match any, charset=, // remove text up until after =, print remaining
        command = f"file -i {data_dir}/{file} | sed -n 's/.*charset=//p'"
        # [:-1] ignore newline
        encoding = os.popen(command).read()[:-1]
        print(f"Encoding was {encoding} instead of assumed utf-8.")
        if encoding == "unknown-8bit":
            print(f"Assuming {default_guess}...")
            dfs.append(pd.read_csv(f"{data_dir}/{file}",  encoding=default_guess))
        else:
            dfs.append(pd.read_csv(f"{data_dir}/{file}",  encoding=encoding))

In [4]:
from functools import reduce
combined = reduce(lambda df1, df2: pd.merge(df1, df2, on="IP_PATIENT_ID", how="inner"), dfs)

# Preprocessing features

In [5]:
# map provider id to type
provider_mapping = pd.read_csv(f"{data_dir}\\{provider_mapping_file}")
provider_mapping = dict(zip(provider_mapping["IP_PROVIDER_ID"], provider_mapping["PROVIDER_TYPE"]))
combined["PCP_IP_PROVIDER_ID"] = combined["PCP_IP_PROVIDER_ID"].map(provider_mapping)
combined.rename(columns={"PCP_IP_PROVIDER_ID" : "PCP_PROVIDER_TYPE"}, inplace=True)

In [6]:
combined

Unnamed: 0,IP_PATIENT_ID,ALLERGEN_ID,DESCRIPTION,AGE,GENDER,RACE,ETHNICITY,VITAL_STATUS,PCP_PROVIDER_TYPE,TOBACCO_PAK_PER_DY,...,TOBACCO_USER,CIGARETTES_YN,SMOKING_TOB_STATUS,SMOKING_START_DATE,SMOKING_QUIT_DATE,ALCOHOL_USER,ALCOHOL_OZ_PER_WK,ALCOHOL_TYPE,IV_DRUG_USER_YN,ILLICIT_DRUG_FREQ
0,00A2C5946EDDAB0737D3E58C6E9919EC,32829,UNKNOWN,69.0,Female,White or Caucasian,Not Hispanic or Latino,Known Deceased,Physician,,...,Never,N,Never Smoker,,,No,,,N,
1,00BB78DEA4FD324D7AAAC03E361BA345,3263,ERTAPENEM,58.0,Female,Other,Hispanic or Latino,Known Deceased,,,...,Never,N,Never Smoker,,,No,,,N,
2,00BC388062D068C8E780EE0CAA5C0231,3775,CHLORHEXIDINE,50.0,Female,White or Caucasian,Not Hispanic or Latino,Known Deceased,Physician,,...,Never,N,Never Smoker,,,No,,,N,
3,00BC388062D068C8E780EE0CAA5C0231,4198,ERYTHROMYCIN,50.0,Female,White or Caucasian,Not Hispanic or Latino,Known Deceased,Physician,,...,Never,N,Never Smoker,,,No,,,N,
4,013E44B08E60B7E0F6EAC9B2E9D84B75,25,PENICILLINS,69.0,Female,White or Caucasian,Not Hispanic or Latino,Not Known Deceased,Physician,,...,Never,N,Never Smoker,,,Yes,.6,Glasses of Wine (5 oz),N,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
789,FDD3203F26306B463CF20CBE91F7A696,4906,MORPHINE,58.0,Female,White or Caucasian,Not Hispanic or Latino,Not Known Deceased,Physician,0.5,...,Quit,N,Former Smoker,,01/01/2013,No,,,N,
790,FE32AD45DE5F0D142B65D2478C152B93,4540,IODINE,59.0,Male,White or Caucasian,Not Hispanic or Latino,Not Known Deceased,Physician,,...,Quit,N,Former Smoker,,11/13/2014,No,0,Standard drinks or equivalent,N,
791,FEDE4690B703274458A40C9CAD3B9163,17901,OTHER,64.0,Male,Other,Cuban,Not Known Deceased,Physician,,...,Never,N,Never Smoker,,,No,,,N,
792,FF8887097C23F016E6D26EC19FA44CD8,33,SULFA ANTIBIOTICS,57.0,Female,White or Caucasian,Not Hispanic or Latino,Known Deceased,Physician,,...,Never,N,Never Smoker,,,No,,,N,


In [7]:
combined["IP_PATIENT_ID"].nunique()

379

# Load + Preproc Outcomes

In [8]:
# get first sheet only
outcomes = pd.read_csv(f"{data_dir}\\{outcome_file}")

# Exclude pediatric data
#exclude_peds = outcomes["Hospital name"] != "UCLA MEDICAL CENTER- PEDIATRICS"
#outcomes = outcomes[exclude_peds]

outcomes

Unnamed: 0,Month,Hospital name,IP_PATIENT_ID,CRRT Total Days,End Date,Machine,ICU,Recov. renal funct.,Transitioned to HD,Palliative Care,Expired
0,Jan-18,UCLA MEDICAL CENTER-ADULT,3C9BED449D46A07C443AF05A9E5AC12B,8,1/1/18,Prisma,8ICU,,,,1.0
1,Jan-18,UCLA MEDICAL CENTER-ADULT,A97E1EE909333DFF64DB5CA508B2D2B6,3,1/1/18,Prisma,4ICU,,,,1.0
2,Jan-18,UCLA MEDICAL CENTER-ADULT,AE1D1E2C40AB323AE44A4FFD2B152772,4,1/2/18,Prisma,8ICU,,1.0,,
3,Jan-18,UCLA MEDICAL CENTER-ADULT,A694BB308CD43024E33F747A111910D4,4,1/2/18,Prisma,4ICU,,,,1.0
4,Jan-18,UCLA MEDICAL CENTER-ADULT,6FE37490810C185DEE42439E9BA75D81,12,1/3/18,NxStage,4ICU,1.0,,,
...,...,...,...,...,...,...,...,...,...,...,...
802,Mar-19,UCLA MEDICAL CENTER-ADULT,55564CCF54119A1538CCADE0B3EC32C3,4,3/28/19,NxStage,6ICU,,,1.0,
803,Mar-19,UCLA MEDICAL CENTER-ADULT,B33B53F9E3F728893B074B3EFED16746,10,3/28/19,Prisma,8ICU,,1.0,,
804,Mar-19,UCLA MEDICAL CENTER-ADULT,2441ED58C95D46AE7A537BDF3CACDC48,4,3/29/19,NxStage,4ICU,,,,1.0
805,Mar-19,SANTA MONICA - UCLA MEDICAL CENTER,4506A0A6658890268C81CFBF49F8297A,5,3/29/19,Prisma,4CWICU,,1.0,,


# Validate Outcomes

In [9]:
positive_outcomes = ["Recov. renal funct.", "Transitioned to HD"]
negative_outcomes = ["Palliative Care", "Expired "] 
outcome_cols = positive_outcomes + negative_outcomes
outcomes[outcome_cols]

Unnamed: 0,Recov. renal funct.,Transitioned to HD,Palliative Care,Expired
0,,,,1.0
1,,,,1.0
2,,1.0,,
3,,,,1.0
4,1.0,,,
...,...,...,...,...
802,,,1.0,
803,,1.0,,
804,,,,1.0
805,,1.0,,


In [10]:
# Each row should have exactly 1 1.0 value (one-hot of the 4 cols)
bad_rows = outcomes[outcome_cols].fillna(0).sum(axis=1) == 0
outcomes[bad_rows]
## TODO: Should i drop the bad row?

Unnamed: 0,Month,Hospital name,IP_PATIENT_ID,CRRT Total Days,End Date,Machine,ICU,Recov. renal funct.,Transitioned to HD,Palliative Care,Expired
590,Nov-18,UCLA MEDICAL CENTER-ADULT,DCE063F946C81CE5181A3114565C9341,4,11/28/18,,8ICU,,,,


# Construct outcome feature (recommend dialysis)

In [11]:
recommend_dialysis = (outcomes["Recov. renal funct."] == 1) | (outcomes["Transitioned to HD"] == 1)
outcomes["recommend_dialysis"] =  recommend_dialysis.astype(int)

# To combine with features
outcome_df = outcomes[["IP_PATIENT_ID", "recommend_dialysis"]]

In [12]:
sum(outcome_df["recommend_dialysis"])/len(outcome_df) * 100

53.03593556381661

# Merge features with outcome

In [13]:
features_with_outcomes = pd.merge(combined, outcome_df, on="IP_PATIENT_ID", how="inner")
features_with_outcomes

Unnamed: 0,IP_PATIENT_ID,ALLERGEN_ID,DESCRIPTION,AGE,GENDER,RACE,ETHNICITY,VITAL_STATUS,PCP_PROVIDER_TYPE,TOBACCO_PAK_PER_DY,...,CIGARETTES_YN,SMOKING_TOB_STATUS,SMOKING_START_DATE,SMOKING_QUIT_DATE,ALCOHOL_USER,ALCOHOL_OZ_PER_WK,ALCOHOL_TYPE,IV_DRUG_USER_YN,ILLICIT_DRUG_FREQ,recommend_dialysis
0,00A2C5946EDDAB0737D3E58C6E9919EC,32829,UNKNOWN,69.0,Female,White or Caucasian,Not Hispanic or Latino,Known Deceased,Physician,,...,N,Never Smoker,,,No,,,N,,0
1,00BB78DEA4FD324D7AAAC03E361BA345,3263,ERTAPENEM,58.0,Female,Other,Hispanic or Latino,Known Deceased,,,...,N,Never Smoker,,,No,,,N,,0
2,00BC388062D068C8E780EE0CAA5C0231,3775,CHLORHEXIDINE,50.0,Female,White or Caucasian,Not Hispanic or Latino,Known Deceased,Physician,,...,N,Never Smoker,,,No,,,N,,0
3,00BC388062D068C8E780EE0CAA5C0231,4198,ERYTHROMYCIN,50.0,Female,White or Caucasian,Not Hispanic or Latino,Known Deceased,Physician,,...,N,Never Smoker,,,No,,,N,,0
4,02944A79E6B98ED66AD1F41EA0720D52,1872,BRASSICA OLERACEA ITALICA,55.0,Female,Other,Hispanic or Latino,Not Known Deceased,Physician,,...,N,Never Smoker,,,No,0,Standard drinks or equivalent,N,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486,FDD3203F26306B463CF20CBE91F7A696,3516,AVOCADO,58.0,Female,White or Caucasian,Not Hispanic or Latino,Not Known Deceased,Physician,0.5,...,N,Former Smoker,,01/01/2013,No,,,N,,1
487,FDD3203F26306B463CF20CBE91F7A696,4906,MORPHINE,58.0,Female,White or Caucasian,Not Hispanic or Latino,Not Known Deceased,Physician,0.5,...,N,Former Smoker,,01/01/2013,No,,,N,,1
488,FEDE4690B703274458A40C9CAD3B9163,17901,OTHER,64.0,Male,Other,Cuban,Not Known Deceased,Physician,,...,N,Never Smoker,,,No,,,N,,1
489,FF8887097C23F016E6D26EC19FA44CD8,33,SULFA ANTIBIOTICS,57.0,Female,White or Caucasian,Not Hispanic or Latino,Known Deceased,Physician,,...,N,Never Smoker,,,No,,,N,,0


# Feature Engineering

## Fill in Missing Values

In [14]:
# TODO: mean should be done on the training set
features_with_outcomes['AGE'].fillna(features_with_outcomes['AGE'].mean(), inplace=True)
features_with_outcomes['TOBACCO_PAK_PER_DY'].fillna(0, inplace=True)
features_with_outcomes['TOBACCO_USED_YEARS'].fillna(0, inplace=True)
features_with_outcomes['TOBACCO_USER'].fillna('Never', inplace=True)
features_with_outcomes['ALCOHOL_USER'].fillna('No', inplace=True)
features_with_outcomes['ALCOHOL_OZ_PER_WK'].fillna(0, inplace=True)
features_with_outcomes['ILLICIT_DRUG_FREQ'].fillna(0, inplace=True)

In [15]:
# convert ALCOHOL_OZ_PER_WK to numeric
def alc_freq_to_numeric(x):
    if x == "0":
        return 0
    if x == "3.6 - 4.2":
        return 3.9
    if x == ".6":
        return .6
    if x == "3.6":
        return 3.6
    if x == "1.8 - 3":
        return 2.4
    if x == "1.8":
        return 1.8
    if x == "2.4":
        return 2.4
    if x == "6":
        return 6
    if x == "8.4":
        return 8.4
    if x == ".6 - 1.2":
        return 0.8
    if x == "12.6":
        return 0.0
    if x == 0:
        return 0
    else:
        raise ValueError("Invalid entry: {}".format(x))
features_with_outcomes['ALCOHOL_OZ_PER_WK'] = features_with_outcomes['ALCOHOL_OZ_PER_WK'].apply(alc_freq_to_numeric)

## One-Hot Encoding

In [38]:
enc = OneHotEncoder(handle_unknown='ignore')
cat_features = features_with_outcomes[['GENDER', 'RACE', 
                                      'ETHNICITY', 'PCP_PROVIDER_TYPE',
                                      'TOBACCO_USER',
                                      'CIGARETTES_YN',
                                      'SMOKING_TOB_STATUS',
                                      'ALCOHOL_USER',
                                      'ALCOHOL_TYPE',
                                      'IV_DRUG_USER_YN', 'ALLERGEN_ID']].to_numpy()
OH_features = enc.fit_transform(cat_features).toarray()

In [42]:
#enc.categories_

In [39]:
sum(OH_features[:,0]), sum(OH_features[:,1]), sum(OH_features[:,2])

(269.0, 222.0, 13.0)

In [43]:
OH_features.shape

(491, 209)

## Generate Feature Vector

In [27]:
real_features = features_with_outcomes[['AGE', 'TOBACCO_PAK_PER_DY', 'TOBACCO_USED_YEARS', 'ALCOHOL_OZ_PER_WK',
                                   'ILLICIT_DRUG_FREQ']].to_numpy()
real_features.shape

(491, 5)

In [29]:
features = np.concatenate((OH_features, real_features), axis=1)
features.shape

(491, 214)

In [30]:
targets = features_with_outcomes['recommend_dialysis'].to_numpy()

# Classification Models (using CV)

In [31]:
# TODO: split by patient id

In [32]:
skf = StratifiedKFold(n_splits=5)

In [55]:
aucrocs = []
accs = []
f1_scores = []
for train_index, test_index in skf.split(features, targets):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = targets[train_index], targets[test_index]

    #clf = LogisticRegression()
    #clf = SVC(probability=True)
    #clf = DecisionTreeClassifier()
    #clf = RandomForestClassifier()
    #clf = KNeighborsClassifier(3)
    #clf = MLPClassifier(alpha=1, max_iter=1000)
    #clf = GaussianNB()
    clf = MultinomialNB()
    #clf = AdaBoostClassifier()
    #clf = QuadraticDiscriminantAnalysis()
    #clf = GaussianProcessClassifier(1.0 * RBF(1.0))
    clf.fit(X_train, y_train)
    pred_probs = clf.predict_proba(X_test)[:,1]
    aucroc = roc_auc_score(y_test, pred_probs)
    acc = accuracy_score(y_test, np.round(pred_probs))
    f1_score_ = f1_score(y_test, np.round(pred_probs))
    print("aucroc: {}, acc: {}, f1_score: {}".format(aucroc, acc, f1_score_))
    aucrocs.append(aucroc)
    accs.append(acc)
    f1_scores.append(f1_score_)
print("mean -- aucroc: {}, acc: {}, f1_score: {}".format(np.mean(aucrocs), np.mean(accs), np.mean(f1_scores)))

aucroc: 0.5497076023391813, acc: 0.5454545454545454, f1_score: 0.6218487394957983
aucroc: 0.5682397959183674, acc: 0.6020408163265306, f1_score: 0.7194244604316548
aucroc: 0.6269132653061225, acc: 0.6224489795918368, f1_score: 0.6991869918699187
aucroc: 0.519469405220368, acc: 0.5612244897959183, f1_score: 0.6993006993006993
aucroc: 0.4602053915275995, acc: 0.5306122448979592, f1_score: 0.6617647058823529
mean -- aucroc: 0.5449070920623277, acc: 0.5723562152133581, f1_score: 0.6803051193960847
