In [None]:
import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import sklearn
import sklearn.pipeline

In [None]:
def clean_outliers(data):
    """Blank out values outside each column's 1st-99th percentile range.

    For every column, values strictly below the 0.01 quantile or strictly
    above the 0.99 quantile are replaced with NaN. Values equal to either
    bound, and values that are already NaN, are left unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns all support ``Series.quantile``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns. ``data`` itself is not
        modified; callers must use the return value.
    """
    cleaned = data.copy()
    for col in list(cleaned):
        lower = cleaned[col].quantile(0.01)
        upper = cleaned[col].quantile(0.99)
        # ``where`` keeps in-range values and fills everything else with NaN.
        # It replaces the chained assignment ``data[col][mask] = None``, which
        # writes through an intermediate object and is not guaranteed to
        # update the frame (pandas flags it with SettingWithCopyWarning).
        cleaned[col] = cleaned[col].where(cleaned[col].between(lower, upper))
    return cleaned

def intersection(lst1, lst2):
    """Return the items of ``lst1`` that also occur in ``lst2``.

    The result follows the order of ``lst1`` and keeps its duplicates;
    membership is checked with the ``in`` operator on ``lst2``.
    """
    shared = []
    for item in lst1:
        if item in lst2:
            shared.append(item)
    return shared

In [None]:
# Feature columns, in the order they are selected from the processed table
# and passed (as a positional array) to the models. Do not reorder.
obj_feature_name = [
    'SOFA_score', 'Respiration_score', 'Coagulation_score', 'Liver_score', 'Cardiovascular_score',
    'CNS_score', 'Renal_score', 'CRP', 'Temperature', 'WBC',
    'SO2', 'Pao2', 'Respiratory_rate', 'Heart_rate', 'Lactate',
    'Systolic_ABP', 'BUN', 'Creatinine', 'ALT', 'AST',
    'Bilirubin', 'GCS', 'Hemoglobin', 'INR', 'Platelet',
    'Chloride', 'Glucose', 'Sodium', 'BMI', 'Age',
]

#### Loading in Data

In [None]:
# Read the raw eICU tables.
df_steriods = pd.read_csv("data/eICU/steroids.csv").astype({'patientunitstayid': int})
df_features = pd.read_excel("data/eICU/feature.xlsx")
df_patient = pd.read_csv("data/eICU/patient.csv")
df_sepsis = pd.read_csv("data/eICU/sepsis.csv")

df_comorb = pd.read_csv("data/eICU/patient_comorbidity_score_df.csv")
df_sofa = pd.read_csv("data/eICU/eICU_sofa.csv")
df_sofa_comps = pd.read_csv("data/eICU/eICU_SOFA_score_comps.csv")

# Left-join the comorbidity score, the SOFA total and the SOFA component
# scores onto the feature table, one table at a time, keyed on the stay id.
for extra_table in (
    df_comorb[['patientunitstayid', 'comorbidity_score']],
    df_sofa[['patientunitstayid', 'sofa']],
    df_sofa_comps,
):
    df_features = df_features.merge(extra_table, on='patientunitstayid', how='left')

# Missing component scores are treated as 0, then 'sofa' is (re)assigned as
# the sum of the six components, replacing any 'sofa' column present so far.
sofa_comps = ['Respiration_score', 'Cardiovascular_score', 'CNS_score', 'Liver_score', 'Coagulation_score', 'Renal_score']
df_features[sofa_comps] = df_features[sofa_comps].fillna(0)
df_features['sofa'] = (
    df_features['Respiration_score']
    + df_features['Coagulation_score']
    + df_features['Liver_score']
    + df_features['Cardiovascular_score']
    + df_features['CNS_score']
    + df_features['Renal_score']
)

# Stay ids with sepsis_onset <= 1440.
# NOTE(review): 1440 is presumably minutes (24 h), matching the "24h" output
# file name - confirm the unit of sepsis_onset.
sepsis_patients = df_sepsis.loc[df_sepsis['sepsis_onset'] <= 1440, 'patientunitstayid'].values

#### Processing Data

In [None]:
# Initial Preprocessing
# Time-varying measurements; only the "<name>_day_1" column of each is used.
time_feature_names = [
    'Albumin', 'ALT', 'AST', 'Bands', 'Bicarbonate', 'Bilirubin', 'BUN', 'Chloride',
    'Creatinine', 'CRP', 'FiO2', 'GCS', 'Glucose', 'Heart_rate', 'Hemoglobin', 'INR',
    'Lactate', 'Lymphocyte_percent', 'MAP', 'PaO2', 'Platelet', 'RDW', 'Lymphocyte_count',
    'Respiratory_rate', 'SO2', 'Sodium', 'Systolic_ABP', 'Temperature', 'Troponin I',
    'Troponin T', 'Urine', 'WBC',
]
baseline_feature_names = [
    'age', 'gender', 'BMI', 'comorbidity_score', 'sofa',
    'Respiration_score', 'Cardiovascular_score', 'CNS_score',
    'Liver_score', 'Coagulation_score', 'Renal_score',
]
base_tv_point_features = [f"{feat}_day_1" for feat in time_feature_names]

# Pick the day-1 and baseline columns, then drop the "_day_1" suffix by
# relabelling the columns positionally.
df_feat_proc = df_features[['patientunitstayid'] + base_tv_point_features + baseline_feature_names]
df_feat_proc.columns = ['patientunitstayid'] + time_feature_names + baseline_feature_names
df_feat_proc = df_feat_proc.merge(df_patient[['patientunitstayid', 'ethnicity']], on='patientunitstayid')
df_feat_proc = df_feat_proc.rename(
    columns={'comorbidity_score': 'Comorbidity_score', 'age': 'Age', 'PaO2': 'Pao2', 'sofa': 'SOFA_score'}
)

# Keep only the id and the model features, restricted to the sepsis cohort,
# and turn infinities into missing values.
df_feat_proc = df_feat_proc[['patientunitstayid'] + obj_feature_name]
df_feat_proc = df_feat_proc[df_feat_proc['patientunitstayid'].isin(sepsis_patients)]
df_feat_proc = df_feat_proc.replace([np.inf, -np.inf], np.nan)

# Clean outliers
# NOTE(review): the candidate list still uses the pre-rename names 'age' and
# 'PaO2', while obj_feature_name holds 'Age' and 'Pao2'. The intersection
# therefore drops both, so those two columns are never outlier-cleaned.
# Confirm against the training-time preprocessing before changing this.
outlier_cols = intersection(time_feature_names + ['age', 'BMI'], obj_feature_name)
df_feat_proc[outlier_cols] = clean_outliers(df_feat_proc[outlier_cols])

# Feature Normalization
# Split the features into those with a single distinct value (NaN counts as
# a value here) and all the others.
features_to_transform = []
uniform_features = []
for f in obj_feature_name:
    bucket = uniform_features if len(pd.unique(df_feat_proc[f])) == 1 else features_to_transform
    bucket.append(f)

# Snapshot taken before imputation.
df_feat_proc_orig = df_feat_proc.copy(deep=True)

# Impute remaining missing values with the per-column median.
df_feat_proc = df_feat_proc.replace([np.inf, -np.inf], np.nan)
df_feat_proc = df_feat_proc.fillna(df_feat_proc.median())

In [None]:
def _load_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE: pickle.load can execute arbitrary code; only load trusted model files.
ri_model = _load_pickle("data/Subtyping/RI_simplified.pkl")
rw_model = _load_pickle("data/Subtyping/RW_simplified.pkl")

ri_model_rf = _load_pickle("data/Subtyping/RI_ori.pkl")
rw_model_rf = _load_pickle("data/Subtyping/RW_ori.pkl")

In [None]:
# Get predictions
ri_thresh = 0.50
rw_thresh = 0.50

# Build the model input once; columns follow the order of obj_feature_name.
X_features = df_feat_proc[obj_feature_name].values

# isinstance (rather than an exact type() comparison) also routes Pipeline
# subclasses through predict_proba. Any other model type falls back to
# predict(), whose output is compared against the same thresholds below.
if isinstance(ri_model, sklearn.pipeline.Pipeline):
    C_1_result = ri_model.predict_proba(X_features)[:, 1]
    C_3_result = rw_model.predict_proba(X_features)[:, 1]
else:
    C_1_result = ri_model.predict(X_features)
    C_3_result = rw_model.predict(X_features)

C_1_result_rf = ri_model_rf.predict_proba(X_features)[:, 1]
C_3_result_rf = rw_model_rf.predict_proba(X_features)[:, 1]

# Rows that exceed both thresholds are assigned to neither subtype: the RI
# score is overwritten with the sentinel -1 and the RW score with 0, so both
# fall below their thresholds afterwards. The mask is computed once, before
# either array is modified.
remove_overlap = 1
if remove_overlap:
    overlap = (C_1_result > ri_thresh) & (C_3_result > rw_thresh)
    C_1_result[overlap] = -1
    C_3_result[overlap] = 0

    overlap_rf = (C_1_result_rf > ri_thresh) & (C_3_result_rf > rw_thresh)
    C_1_result_rf[overlap_rf] = -1
    C_3_result_rf[overlap_rf] = 0

In [None]:
# One row per stay: threshold flags for each model plus a coded subtype.
df_subtypes = df_feat_proc[['patientunitstayid']].copy()
df_subtypes['RI'] = C_1_result > ri_thresh
df_subtypes['RW'] = C_3_result > rw_thresh

# Subtype code: 0 = neither flag, 1 = RI, 3 = RW. RW is written last, so it
# wins if both flags are set.
df_subtypes['subtype'] = 0
df_subtypes.loc[df_subtypes['RI'], 'subtype'] = 1
df_subtypes.loc[df_subtypes['RW'], 'subtype'] = 3

In [None]:
# Write the subtype table to CSV, omitting the DataFrame index column.
df_subtypes.to_csv("data/eICU/zxu_subtypes_lr_24h.csv", index=False)