# In [28]:
# Class for Outlier Detection in Lab and Vital Data
# Kaiser Permanente ©
# Created by: Suraj Anand and Ryan Apfel
# Date: 07/22/2019

# Hope this comes in handy!!

import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from mpl_toolkits.mplot3d import Axes3D


from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from pyod.models.iforest import IForest
from pyod.models.auto_encoder import AutoEncoder

from itertools import compress
from scipy import stats
from scipy.spatial import distance


# In [29]:
def clean_gfr_row(x):
    """Convert one GFR result row to a number.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row exposing the raw result under the key ``'ORD_VALUE'``.

    Returns
    -------
    number
        For string values: the text before the first ``'-'``, with anything
        up to and including a ``'>'`` removed, parsed with
        ``pd.to_numeric``.  Non-string values are passed to
        ``pd.to_numeric`` directly.  Unparseable input becomes NaN.
    """
    raw = x['ORD_VALUE']
    # isinstance (rather than an exact type comparison) so that str
    # subclasses such as numpy.str_ are parsed as text too.
    if isinstance(raw, str):
        lower_bound = raw.split('-')[0]        # '45-50' -> '45'
        raw = lower_bound.split('>')[-1]       # '>60'   -> '60'
    return pd.to_numeric(raw, errors='coerce')
    

def clean_urinalysis(x):
    """Encode a graded urinalysis result as an ordinal number.

    The lower-cased text of ``x['ORD_VALUE']`` maps to 0 when it contains
    ``'neg'``, otherwise to 1-4 for the first of ``'1+'`` .. ``'4+'`` it
    contains, otherwise to NaN.
    """
    text = str(x['ORD_VALUE']).lower()
    if 'neg' in text:
        return 0
    for grade in (1, 2, 3, 4):
        if str(grade) + '+' in text:
            return grade
    return np.nan

def clean_nitrite(x):
    """Encode a nitrite result: 0 if the text contains 'neg', 1 if 'pos', else NaN.

    The comparison is case-insensitive and 'neg' is checked first.
    """
    text = str(x['ORD_VALUE']).lower()
    for token, code in (('neg', 0), ('pos', 1)):
        if token in text:
            return code
    return np.nan

def clean_microscopic(x):
    """Encode the microscopic-exam status text.

    Returns 0 when the lower-cased ``x['ORD_VALUE']`` contains
    ``'to follow'``, 1 when it contains ``'not indicated'`` and ``None``
    for anything else.
    """
    text = str(x['ORD_VALUE']).lower()
    if 'to follow' in text:
        return 0
    return 1 if 'not indicated' in text else None

def clean_wbc(x):
    """Encode a cells-per-field count range as an ordinal grade.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row exposing the raw result under the key ``'ORD_VALUE'``.

    Returns
    -------
    int or None
        0 for the exact text ``'0'``; 1-5 for the listed count ranges;
        6 for text containing ``'>100'``; ``None`` when nothing matches.

    Notes
    -----
    A range only matches when it is not embedded in a longer number, so
    ``'10-20'`` is graded 4 and is no longer mistaken for ``'0-2'``.
    """
    import re  # local import: the file's import header does not provide re

    val = str(x['ORD_VALUE']).lower()
    if val == '0':
        return 0

    graded_ranges = (
        (1, ('0-2',)),
        (2, ('2-5',)),
        (3, ('5-10', '6-10')),
        (4, ('25-50', '11-15', '10-20', '31-40', '16-20')),
        (5, ('50-100', '60-100', '40-60')),
    )
    for grade, ranges in graded_ranges:
        for count_range in ranges:
            # Digit guards on both sides stop '0-2' matching inside '10-20'.
            pattern = r'(?<!\d)' + re.escape(count_range) + r'(?!\d)'
            if re.search(pattern, val):
                return grade

    if '>100' in val:
        return 6
    return None

def clean_mucus(x):
    """Encode a mucus result as an ordinal grade.

    Works on the lower-cased text of ``x['ORD_VALUE']``: exactly ``'0'``
    gives 0; otherwise the first matching rule wins: ``'present'`` -> 1,
    ``'1+'`` -> 2, ``'2+'`` or ``'6-10'`` -> 3, ``'3+'`` -> 4,
    ``'4+'`` -> 5.  Returns ``None`` when no rule matches.
    """
    text = str(x['ORD_VALUE']).lower()
    if text == '0':
        return 0

    ladder = (
        (('present',), 1),
        (('1+',), 2),
        (('2+', '6-10'), 3),
        (('3+',), 4),
        (('4+',), 5),
    )
    for tokens, grade in ladder:
        if any(token in text for token in tokens):
            return grade
    return None

def clean_bacteria(x):
    """Encode a bacteria result as an ordinal grade.

    Works on the lower-cased text of ``x['ORD_VALUE']``; the first matching
    rule wins:

    * 0 - exactly ``'0'``, or contains ``'none'`` / ``'neg'``
    * 1 - contains ``'present'``, ``'trace'`` or ``'few'``
    * 2 - contains ``'mod'``
    * 3 - contains ``'1+'``, ``'2+'``, ``'3+'`` or ``'4+'``

    Returns ``None`` when no rule matches.
    """
    text = str(x['ORD_VALUE']).lower()

    def mentions(*tokens):
        return any(token in text for token in tokens)

    if text == '0' or mentions('none', 'neg'):
        return 0
    if mentions('present', 'trace', 'few'):
        return 1
    if mentions('mod'):
        return 2
    if mentions('1+', '2+', '3+', '4+'):
        return 3
    return None


def clean_binary(x):
    """Collapse a qualitative result to 0 (negative), 1 (positive) or NaN.

    The lower-cased text of ``x['ORD_VALUE']`` is negative when it contains
    ``'neg'`` or equals one of the negative keywords; it is positive when it
    equals one of the positive keywords or contains ``'1+'`` .. ``'4+'``.
    The negative test is made first.
    """
    text = str(x['ORD_VALUE']).lower()
    negatives = {'neg', 'negative', 'absent', 'not indicated', '0', 'none', '(neg)', 'normal'}
    positives = {'pos', 'positive', 'to follow', 'trace', 'occ', 'mod', '1+', '2+', '3+', '4+', 'few'}

    if 'neg' in text or text in negatives:
        return 0
    if text in positives or any(grade in text for grade in ('1+', '2+', '3+', '4+')):
        return 1
    return np.nan

class Clean_Labs:
    """Cleaning helpers for a long-format lab-results DataFrame.

    The methods read the columns ``COMPONENT_NAME``, ``PROC_NAME`` and
    ``ORD_VALUE`` of ``data`` and modify ``data`` in place.
    """

    _URINALYSIS_PROC = 'URINALYSIS, AUTOMATED'

    # (regex, replacement) pairs applied, in this order, by basic_clean.
    _BASIC_REPLACEMENTS = (
        (r'[Nn]egative', r'0'),
        (r'NEGATIVE', r'0'),
        (r'NEG', r'0'),
        (r'NORMAL', r'0'),
        (r'[Pp]ositive', r'1'),
        (r'POSITIVE', r'1'),
        (r'FEW', r'1'),
        (r'TRACE', r'1'),
        (r'[><]', r''),
        (r'\(.+\)', r''),
        (r'[=\-\*\:\,]+', r''),
    )

    def __init__(self, data):
        self.data = data

    def _apply_rowwise(self, rows, func, target='ORD_VALUE'):
        """Store ``func(row)`` in column ``target`` for the rows selected by ``rows``."""
        # Applying a function over an empty selection does not yield a Series
        # that can be assigned back, so there is nothing to do in that case.
        if not rows.any():
            return
        self.data.loc[rows, target] = self.data.loc[rows].apply(func, axis=1)

    def clean_gfr(self):
        """Replace GFR results with their numeric value (see ``clean_gfr_row``)."""
        self._apply_rowwise(
            self.data.COMPONENT_NAME == 'GLOMERULAR FILTRATION RATE',
            clean_gfr_row,
        )

    def get_numeric_percent(self, component='CREATININE'):
        """Return the fraction (0-1) of ``component`` results that parse as numbers.

        Returns NaN when the component has no rows at all.
        """
        values = self.data.loc[self.data.COMPONENT_NAME == component, 'ORD_VALUE']
        if len(values) == 0:
            return np.nan
        return pd.to_numeric(values, errors='coerce').count() / len(values)

    def get_nonnumeric_inputs(self, component='CREATININE'):
        """Return value counts of ``component`` results containing non-numeric text.

        A result qualifies when it contains any character other than a
        digit, ``'.'`` or ``'^'``.  Missing and non-string values are
        skipped.
        """
        values = self.data.loc[self.data.COMPONENT_NAME == component, 'ORD_VALUE']
        # na=False: missing/non-string entries would otherwise produce NaN in
        # the mask and make the boolean selection fail.
        has_other_chars = values.str.contains(r'[^0-9^\.]', na=False)
        return values[has_other_chars].value_counts()

    def basic_clean(self):
        """Normalise free-text results of every non-urinalysis procedure.

        Negative/normal wording becomes ``'0'``, positive/few/trace wording
        becomes ``'1'``; comparison signs, parenthesised remarks and the
        punctuation ``= - * : ,`` are removed.  The result is written back
        to ``self.data``.
        """
        rows = self.data.PROC_NAME != self._URINALYSIS_PROC
        values = self.data.loc[rows, 'ORD_VALUE']
        for pattern, replacement in self._BASIC_REPLACEMENTS:
            values = values.replace(to_replace=pattern, value=replacement, regex=True)
        # Assign through .loc on the frame itself; replacing "in place" on a
        # .loc selection only changes a temporary copy.
        self.data.loc[rows, 'ORD_VALUE'] = values

    def urinalysis_to_categorical(self):
        """Encode urinalysis components as ordinal codes in ``ORD_VALUE``."""
        # Built here (not at class level) because the cleaners are
        # module-level functions looked up at call time.
        cleaners = {
            'MICROSCOPIC EXAM, URINE': clean_microscopic,
            "WBC'S, UA/HPF": clean_wbc,
            'RBC, URINE HPF': clean_wbc,
            'NITRITE, UA': clean_nitrite,
            'KETONES, UA': clean_urinalysis,
            'UROBILINOGEN, UA, QL': clean_urinalysis,
            'BILIRUBIN, UA': clean_urinalysis,
            'UA HGB': clean_urinalysis,
            'BACTERIA, URINE HPF': clean_bacteria,
            # NOTE(review): this previously used clean_bacteria while
            # clean_mucus was never called; assumed to be a copy-paste slip.
            'MUCUS, URINE': clean_mucus,
        }
        is_urinalysis = self.data.PROC_NAME == self._URINALYSIS_PROC
        for component, cleaner in cleaners.items():
            self._apply_rowwise(
                is_urinalysis & (self.data.COMPONENT_NAME == component),
                cleaner,
            )

    def urinalyisis_to_binary(self):
        """Add ``BINARY_ORD_VALUE`` (0/1/NaN via ``clean_binary``) for urinalysis rows.

        Rows of other procedures keep NaN in the new column.
        """
        if 'BINARY_ORD_VALUE' not in self.data.columns:
            self.data['BINARY_ORD_VALUE'] = np.nan
        self._apply_rowwise(
            self.data.PROC_NAME == self._URINALYSIS_PROC,
            clean_binary,
            target='BINARY_ORD_VALUE',
        )

    # Correctly spelled alias; the original (misspelled) name is kept for callers.
    urinalysis_to_binary = urinalyisis_to_binary


        

# In [1]:

def bmi(row):
    """Return ``WEIGHT / 16 * 703 / HEIGHT ** 2`` for one row.

    NOTE(review): the division by 16 and the factor 703 suggest WEIGHT is
    in ounces and HEIGHT in inches (imperial BMI formula) - confirm units.
    """
    scaled_weight = row['WEIGHT'] / 16
    height_squared = row['HEIGHT'] ** 2
    return scaled_weight * 703 / height_squared
    
def inches_conversion(row):
    """Convert a feet-and-inches height string such as ``5' 10"`` to inches.

    Parameters
    ----------
    row : mapping or pandas.Series
        A row exposing the height text under the key ``'HEIGHT'``.

    Returns
    -------
    float
        ``feet * 12 + inches``; NaN when the height is missing (NaN/None).
        The space after the apostrophe is optional and a missing inches
        part counts as 0.
    """
    height = row['HEIGHT']
    if pd.isna(height):
        return np.nan

    feet, _, remainder = height.partition("'")
    inches = remainder.split('"')[0].strip()
    return float(feet) * 12 + (float(inches) if inches else 0.0)
    
class Outlier_Detection_Vitals:
    """Outlier-detection methods for a vitals DataFrame.

    Each ``detect_*`` method adds one boolean flag column to ``self.data``
    (modified in place) and returns a count.
    """

    # (lowest, highest) value still accepted for each vital (method 1).
    _MED_INFEASIBLE_LIMITS = {
        'TEMPERATURE': (89.6, 110),
        'BP_SYSTOLIC': (20, 260),
        'BP_DIASTOLIC': (10, 160),
        'RESPIRATIONS': (6, 100),
        'PULSE_OXIMETRY': (0, 101),
        'PULSE': (30, 250),
        'WEIGHT': (1, 14000),
        'HEIGHT': (3, 100),
    }

    # Columns screened by the z-score method (method 2).
    _STD_COLUMNS = ('BP_SYSTOLIC', 'BP_DIASTOLIC', 'RESPIRATIONS',
                    'PULSE', 'WEIGHT', 'HEIGHT')

    # Feature columns of the PCA + k-means method (method 3).
    _PCA_COLUMNS = ['AGE', 'SEX_C', 'TEMPERATURE', 'BP_SYSTOLIC',
                    'BP_DIASTOLIC', 'RESPIRATIONS', 'PULSE']

    # Feature columns of the isolation-forest and auto-encoder methods (4, 5).
    _MODEL_COLUMNS = ['AGE', 'SEX_C', 'TEMPERATURE', 'BP_SYSTOLIC',
                      'BP_DIASTOLIC', 'RESPIRATIONS', 'PULSE_OXIMETRY', 'PULSE']

    def __init__(self, data):
        self.data = data

    def _scaled_vitals(self, columns):
        """Return the given columns median-imputed and standardised (NumPy array).

        ``self.data`` itself is not modified; imputed values are not kept.
        """
        vitals = self.data[list(columns)]
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        return StandardScaler().fit_transform(imputer.fit_transform(vitals))

    def clean_BMI(self, outpatient=False):
        """Fill ``BMI`` from ``WEIGHT`` and ``HEIGHT`` using ``bmi``.

        With ``outpatient=True`` BMI is recomputed for every row whose
        weight and height are present and non-zero; otherwise only rows
        with a missing BMI and a present weight and height are filled.
        """
        has_both = self.data.WEIGHT.notnull() & self.data.HEIGHT.notnull()
        if outpatient:
            rows = has_both & (self.data.HEIGHT != 0) & (self.data.WEIGHT != 0)
        else:
            rows = self.data.BMI.isnull() & has_both
        # Nothing to compute (and nothing assignable) for an empty selection.
        if rows.any():
            self.data.loc[rows, 'BMI'] = self.data.loc[rows].apply(bmi, axis=1)

    def clean_height(self, outpatient=False):
        """For outpatient data, convert ``HEIGHT`` strings to inches; otherwise do nothing."""
        if outpatient:
            self.data['HEIGHT'] = self.data.apply(inches_conversion, axis=1)

    # creates column MED_INFEASE for outlier detection method 1
    def detect_medically_infeasible(self):
        """Flag rows holding any vital outside its hard-coded feasible range.

        Adds the boolean column ``MED_INFEASE`` and returns the number of
        out-of-range values (a row with two such values counts twice).
        Missing values are never flagged.
        """
        flagged = pd.Series(False, index=self.data.index)
        count_outliers = 0
        for column, (lowest, highest) in self._MED_INFEASIBLE_LIMITS.items():
            out_of_range = (self.data[column] < lowest) | (self.data[column] > highest)
            count_outliers += int(out_of_range.sum())
            flagged |= out_of_range
        self.data['MED_INFEASE'] = flagged
        return count_outliers

    # creates column STD_OUTLIER for outlier detection method 2
    def detect_outliers_std(self, id_col='PAT_NUM'):
        """Flag values more than 3 standard deviations from their column mean.

        For each screened column, values with ``|z| > 3`` are candidate
        outliers; candidates of a patient (``id_col``) with two or more
        candidates in that column are discarded, so only isolated extreme
        readings remain.  Adds the boolean column ``STD_OUTLIER`` (any
        column flagged) and returns the number of flagged rows.
        """
        patient_ids = self.data[id_col]
        any_outlier = np.zeros(len(self.data), dtype=bool)

        for test in self._STD_COLUMNS:
            values = self.data[test].to_numpy(dtype=float)
            observed = ~np.isnan(values)
            extreme = np.zeros(len(values), dtype=bool)
            if observed.any():
                # z-scores are computed over the non-missing values only
                extreme[observed] = np.abs(stats.zscore(values[observed])) > 3

            # patients with >= 2 extreme readings in this column are ignored
            per_patient = patient_ids[extreme].value_counts()
            repeat_ids = per_patient[per_patient >= 2].index
            extreme &= ~patient_ids.isin(repeat_ids).to_numpy()

            any_outlier |= extreme

        self.data['STD_OUTLIER'] = any_outlier
        return int(any_outlier.sum())

    # creates column PCA_Clust_Outlier for outlier detection method 3
    def detect_outliers_pca_clust(self, id_col='PAT_NUM', dist_thresh=7, plot_pca=True, plot_clusters_2d=True, plot_dist_cum=False, random_state=None):
        """Flag rows lying far from every k-means centre in PCA space.

        The vitals are median-imputed, standardised, projected on three
        principal components and grouped into three k-means clusters.  A row
        is an outlier when its Euclidean distance to the closest cluster
        centre exceeds ``dist_thresh``.

        Parameters
        ----------
        id_col : str
            Unused; kept for signature compatibility.
        dist_thresh : float
            Distance above which a row is flagged.
        plot_pca, plot_clusters_2d, plot_dist_cum : bool
            Toggle the 3-D component scatter, the 2-D cluster scatter and
            the cumulative distance distribution plot.
        random_state : int or None
            Forwarded to ``KMeans`` to make the clustering reproducible.

        Returns
        -------
        int
            Number of rows flagged in the new ``PCA_Clust_Outlier`` column.
        """
        vitals_dat = self._scaled_vitals(self._PCA_COLUMNS)

        pca = PCA(n_components=3)
        components = pca.fit_transform(vitals_dat)
        print("EXPLAINED VARIANCE RATIO: ", sum(pca.explained_variance_ratio_))

        pca_results = pd.DataFrame(components, columns=['p1', 'p2', 'p3'])

        if plot_pca:
            fig = plt.figure()
            # add_subplot attaches the 3-D axes to the figure explicitly
            ax = fig.add_subplot(111, projection='3d')
            ax.scatter(pca_results.p1, pca_results.p2, pca_results.p3)
            plt.show()

        kmeans = KMeans(n_clusters=3, random_state=random_state).fit(components)
        # needed for the distances below, whether or not anything is plotted
        centers = kmeans.cluster_centers_

        if plot_clusters_2d:
            plt.scatter(pca_results.p1, pca_results.p2)
            plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)

        # Each ROW of `centers` is one cluster centre; cdist returns the
        # distance from every point to every centre.
        dist_closest = distance.cdist(components, centers).min(axis=1)

        if plot_dist_cum:
            # NOTE(review): distplot is deprecated in recent seaborn releases;
            # consider histplot/ecdfplot once the minimum version is known.
            sns.distplot(pd.Series(dist_closest, name='Dist_closest'), color='g',
                         hist_kws=dict(cumulative=True),
                         kde_kws=dict(cumulative=True))

        # Positional assignment: the distances follow the row order of
        # self.data, whatever its index labels are.
        self.data['PCA_Clust_Outlier'] = dist_closest > dist_thresh
        return int(self.data['PCA_Clust_Outlier'].sum())

    # creates column IForest_Outlier for outlier detection method 4
    def detect_outliers_iforest(self, outliers_fraction=0.01, random_state=42):
        """Flag outliers with a pyod isolation forest.

        ``outliers_fraction`` is passed as the model's ``contamination``.
        Adds the boolean column ``IForest_Outlier`` and returns the number
        of flagged rows.
        """
        vitals_dat = self._scaled_vitals(self._MODEL_COLUMNS)

        clf = IForest(contamination=outliers_fraction, random_state=random_state)
        clf.fit(vitals_dat)

        self.data['IForest_Outlier'] = clf.predict(vitals_dat).astype(bool)
        return int(self.data['IForest_Outlier'].sum())

    # creates column AE_Outlier for outlier detection method 5
    def detect_outliers_autoencoder(self, hidden_neurons=(64, 32, 3, 32, 64), outliers_fraction=0.01, random_state=42, epochs=7):
        """Flag outliers with a pyod auto-encoder.

        ``hidden_neurons`` lists the hidden layer sizes and
        ``outliers_fraction`` is passed as the model's ``contamination``.
        Adds the boolean column ``AE_Outlier`` and returns the number of
        flagged rows.
        """
        vitals_dat = self._scaled_vitals(self._MODEL_COLUMNS)

        # The default is an immutable tuple; the model gets its own list.
        clf = AutoEncoder(hidden_neurons=list(hidden_neurons), contamination=outliers_fraction, random_state=random_state, epochs=epochs)
        clf.fit(vitals_dat)

        self.data['AE_Outlier'] = clf.predict(vitals_dat).astype(bool)
        return int(self.data['AE_Outlier'].sum())

    ## Other out-of-the-box OD algorithms are available in the pyod package

    def categoricalize_age(self):
        """Add ``AGE_GROUP``: INFANT (< 2), ADULT (2 to < 65), ELDERLY (>= 65).

        Rows with a missing ``AGE`` are not assigned a group.  Prints the
        resulting group counts.
        """
        age = self.data.AGE
        self.data.loc[age < 2, "AGE_GROUP"] = "INFANT"
        self.data.loc[(age >= 2) & (age < 65), "AGE_GROUP"] = "ADULT"
        self.data.loc[age >= 65, "AGE_GROUP"] = "ELDERLY"

        print(self.data.AGE_GROUP.value_counts())
     

    