<h1>Miscellanous Experiments on the Cohort</h1>
- Autopsy Rates
- Table One
- Pairwise Correlations Between Mistrust Scores and OASIS
- Sentiment Disparities, Stratified by Mistrust OR Race

In [41]:
import cPickle as pickle
import numpy as np
import pandas as pd
import psycopg2
from time import strftime, gmtime
import tqdm

In [42]:
# create a database connection
sqluser = 'wboag'
dbname = 'mimic'
schema_name = 'mimiciii'

# Connect to the database
con = psycopg2.connect(dbname=dbname, user=sqluser, host="/var/run/postgresql")

<h1>Load Data</h1>

In [43]:
def normalize_race(race):
    if 'HISPANIC' in race:
        return 'Hispanic'
    if 'SOUTH AMERICAN' in race:
        return 'Hispanic'
    if 'AMERICAN INDIAN' in race:
        return 'Native American'
    if 'ASIAN' in race:
        return 'Asian'
    if 'BLACK' in race:
        return 'Black'
    if 'UNKNOWN/NOT SPECIFIED' in race:
        return 'Not Specified'
    if 'WHITE' in race:
        return 'White'
    #print race
    return 'Other'

def normalize_insurance(ins):
    if ins in ['Government', 'Medicaid', 'Medicare']:
        return 'Public'
    elif ins == 'Private':
        return 'Private'
    else:
        return 'Self-Pay'
    
def normalize_discharge(disch):
    if disch.startswith('HOSPICE'):
        return 'Hospice'
    if disch == 'DEAD/EXPIRED':
        return 'Deceased'
    if disch.startswith('SNF'):
        return 'Skilled Nursing Facility'
    return 'other'

def normalize_age(age):
    return min(age, 90)

In [44]:
demographics_query = 'SELECT distinct subject_id,hadm_id,gender,age,ethnicity FROM mimiciii.icustay_detail;' 
demographics = pd.read_sql_query(demographics_query, con)
demographics.head()

Unnamed: 0,subject_id,hadm_id,gender,age,ethnicity
0,86220,185470,M,55.7428,WHITE
1,75,112086,F,76.7719,WHITE
2,558,104958,M,50.3831,WHITE
3,52800,134778,M,75.8225,WHITE
4,16967,137330,M,78.101,UNKNOWN/NOT SPECIFIED


In [45]:
# admissions info
print strftime("%Y-%m-%d %H:%M:%s")
discharge_query = '''SELECT distinct subject_id,hadm_id,ethnicity,insurance,discharge_location,admittime,dischtime
  FROM mimiciii.admissions;'''
discharge = pd.read_sql_query(discharge_query, con)

discharge['discharge_location'] = discharge['discharge_location'].apply(normalize_discharge)
print strftime("%Y-%m-%d %H:%M:%s")

2018-07-13 17:24:1531517043
2018-07-13 17:24:1531517043


In [46]:
# EOL Cohort

print strftime("%Y-%m-%d %H:%M:%s")

# patients who died or went to hospice
#eol_locations = {'Hospice', 'Deceased'}
eol_locations = {'Hospice', 'Deceased', 'Skilled Nursing Facility'}
disch = discharge.loc[discharge['discharge_location'].isin(eol_locations)]

ids =  set(disch.hadm_id.values)
eol_cohort_initial = discharge.loc[discharge.hadm_id.isin(ids)]

inds_at_least_6hrs = eol_cohort_initial['dischtime'] - eol_cohort_initial['admittime'] > pd.Timedelta(days=1)
eol_cohort_initial = eol_cohort_initial.loc[inds_at_least_6hrs]


# add demographics info
eol_cohort = pd.merge(eol_cohort_initial, demographics, on=['hadm_id','ethnicity'])
eol_cohort = eol_cohort.rename(columns={'ethnicity':'race'})

# normalize columns of data
eol_cohort['race'              ] = eol_cohort['race'              ].apply(normalize_race)
eol_cohort['insurance'         ] = eol_cohort['insurance'         ].apply(normalize_insurance)
eol_cohort['age'               ] = eol_cohort['age'               ].apply(normalize_age)

los = eol_cohort['dischtime'] - eol_cohort['admittime']
eol_cohort['los'] = los.apply(lambda t:t.seconds/3600.)

# make sure each hadm_id has only died once
assert len(eol_cohort) == len(set(eol_cohort['hadm_id'].values))
print 'eol subjects:', len(set(eol_cohort['hadm_id'].values))

print strftime("%Y-%m-%d %H:%M:%s")

eol_cohort.head()

2018-07-13 17:24:1531517043
eol subjects: 12892
2018-07-13 17:24:1531517043


Unnamed: 0,subject_id_x,hadm_id,race,insurance,discharge_location,admittime,dischtime,subject_id_y,gender,age,los
0,3,145834,White,Public,Skilled Nursing Facility,2101-10-20 19:08:00,2101-10-31 13:58:00,3,M,76.5268,18.833333
1,9,150750,Not Specified,Public,Deceased,2149-11-09 13:06:00,2149-11-14 10:15:00,9,M,41.7902,21.15
2,12,112213,White,Public,Deceased,2104-08-07 10:15:00,2104-08-20 02:57:00,12,M,72.3724,16.7
3,21,111970,White,Public,Deceased,2135-01-30 20:50:00,2135-02-08 02:08:00,21,M,87.8263,5.3
4,31,128652,White,Public,Deceased,2108-08-22 23:27:00,2108-08-30 15:00:00,31,M,72.2671,15.55


<h1>Autopsy Rates</h1>

In [47]:
# LABEL: autopsy consent/decline

# Query mimic for notes
notes_query = \
"""
select distinct n.hadm_id,n.category,n.text,n.chartdate,n.charttime
from mimiciii.noteevents n
where iserror IS NULL --this is null in mimic 1.4, rather than empty space
and hadm_id IS NOT NULL
;
"""
notes = pd.read_sql_query(notes_query, con)

autopsy_consent = []
autopsy_decline = []
for hadm_id,rows in tqdm.tqdm(notes.groupby('hadm_id')):
    consented = False
    declined = False
    for text in rows.text.values:
        for line in text.lower().split('\n'):
            if 'autopsy' in line:
                if 'decline' in line:
                    declined = True
                if 'not consent' in line:
                    declined = True
                if 'refuse' in line:
                    declined = True
                if 'denied' in line:
                    declined = True
                    
                if 'consent' in line:
                    consented = True
                if 'agree' in line:
                    consented = True
                if 'request' in line:
                    consented = True

    # probably some "declined donation but consented to autopsy" or something confusing. just ignore hard cases
    if consented and declined:
        continue

    if consented:
        autopsy_consent.append(hadm_id)
    if declined:
        autopsy_decline.append(hadm_id)
        

100%|██████████| 58361/58361 [00:41<00:00, 1414.88it/s]


In [48]:

for race in ['White', 'Black', 'Asian', 'Native American', 'Hispanic', 'Not Specified', 'Other']:
    cohort = eol_cohort.loc[eol_cohort['race']==race]
    consent = cohort.loc[cohort['hadm_id'].isin(autopsy_consent)]
    decline = cohort.loc[cohort['hadm_id'].isin(autopsy_decline)]

    print race
    print '\tautopsy consent:', len(consent)
    print '\tautopsy decline:', len(decline)
    print '\t% consent:', len(consent)/(len(consent)+len(decline)+1e-9)
    print 


White
	autopsy consent: 144
	autopsy decline: 421
	% consent: 0.254867256637

Black
	autopsy consent: 29
	autopsy decline: 45
	% consent: 0.391891891887

Asian
	autopsy consent: 2
	autopsy decline: 20
	% consent: 0.090909090905

Native American
	autopsy consent: 0
	autopsy decline: 1
	% consent: 0.0

Hispanic
	autopsy consent: 9
	autopsy decline: 9
	% consent: 0.499999999972

Not Specified
	autopsy consent: 30
	autopsy decline: 54
	% consent: 0.357142857139

Other
	autopsy consent: 19
	autopsy decline: 34
	% consent: 0.358490566031



<h1>Table One</h1>

In [49]:
# Table One

from tableone import TableOne
import pandas as pd
import matplotlib.pyplot as plt

los = eol_cohort['dischtime'] - eol_cohort['admittime']
eol_cohort['los'] = los.apply(lambda t:t.seconds/3600.)

# optionally, a categorical variable for stratification
groupby = ['race']

# columns to be summarized
columns = ['discharge_location', 'gender', 'los', 'age'] 

# columns containing categorical variables
categorical = ['discharge_location', 'gender']

# non-normal variables
nonnormal = ['age', 'los']

# alternative labels
labels={'los': 'Length of stay', 'age': 'Age', 'race':'Race',
        'gender':'Gender', 'discharge_location':'Discharge Location'}

# combine all information
#grouped_df = pd.merge(eol_cohort, demographics, on=['hadm_id'])
grouped_df = eol_cohort

# create an instance of TableOne with the input arguments
grouped_table = TableOne(grouped_df, columns, categorical, groupby, nonnormal, labels=labels, isnull=False, pval=True)

# view tableone
grouped_table

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by Race,Grouped by Race,Grouped by Race,Grouped by Race,Grouped by Race,Grouped by Race,Grouped by Race,Grouped by Race,Grouped by Race
Unnamed: 0_level_1,Unnamed: 1_level_1,Asian,Black,Hispanic,Native American,Not Specified,Other,White,pval,ptest
variable,level,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
n,,303,1158,292,7,1040,589,9503,,
Discharge Location,Deceased,128 (42.24),352 (30.4),103 (35.27),4 (57.14),584 (56.15),285 (48.39),3411 (35.89),0.0,Chi-squared (warning: expected count < 5)
Discharge Location,Hospice,16 (5.28),40 (3.45),20 (6.85),,32 (3.08),23 (3.9),418 (4.4),,
Discharge Location,Skilled Nursing Facility,159 (52.48),766 (66.15),169 (57.88),3 (42.86),424 (40.77),281 (47.71),5674 (59.71),,
Gender,F,133 (43.89),701 (60.54),132 (45.21),4 (57.14),521 (50.1),284 (48.22),4764 (50.13),0.0,Chi-squared (warning: expected count < 5)
Gender,M,170 (56.11),457 (39.46),160 (54.79),3 (42.86),519 (49.9),305 (51.78),4739 (49.87),,
Length of stay,,"14.73 [7.03,19.58]","13.75 [5.22,19.53]","14.30 [8.68,20.20]","16.67 [10.65,21.06]","13.85 [5.85,19.18]","14.10 [5.45,19.75]","13.98 [6.13,19.44]",0.282,Kruskal-Wallis
Age,,"76.38 [64.38,84.66]","71.49 [60.59,80.30]","66.55 [54.85,77.82]","70.82 [49.22,84.73]","75.51 [64.30,83.20]","74.18 [62.68,81.33]","77.93 [66.79,84.90]",0.0,Kruskal-Wallis


<h1>All Metrics Correlation</h1>

In [50]:
# Load all scores

def normalize(scores):
    mu  = sum(scores.values())
    std = np.std(scores.values())
    return {k:(v-mu)/std for k,v in scores.items()}

# Get the OASIS scores
oasis_query = 'SELECT distinct hadm_id,max(oasis) as oasis FROM mimiciii.oasis GROUP BY hadm_id'
oasis = pd.read_sql_query(oasis_query, con)
oasis_scores = normalize(dict(oasis[['hadm_id','oasis']].values))

# Mistrust scores
with open('../data/mistrust_noncompliant.pkl', 'rb') as f:
    noncompliant_scores = normalize(pickle.load(f))
with open('../data/mistrust_autopsy.pkl', 'rb') as f:
    autopsy_scores = normalize(pickle.load(f))
with open('../data/neg_sentiment.pkl', 'rb') as f:
    negsent_scores = normalize(pickle.load(f))

# To make my for loop work easier
race_scores = {}
for i,row in eol_cohort.iterrows():
    if row.race == 'White':
        race_scores[row.hadm_id] = 0
    elif row.race == 'Black':
        race_scores[row.hadm_id] = 1

sa_ids = set(negsent_scores.keys()) & set(oasis_scores.keys()) & \
         set(noncompliant_scores.keys()) & set(autopsy_scores.keys()) & set(race_scores.keys())

print len(sa_ids)

10401


In [57]:
def select(scores):
    return [scores[hadm_id] for hadm_id in sa_ids]
    
all_scores = {'oasis':select(oasis_scores), 'sentiment':select(negsent_scores), 
              'noncompliant':select(noncompliant_scores), 'autopsy':select(autopsy_scores)}
metrics = ['oasis', 'noncompliant', 'autopsy', 'sentiment']

from scipy.stats import pearsonr

print ' '*15,
for m in metrics:
    print '%19s' % m,
print
for i in range(len(metrics)):
    m1 = metrics[i]
    print '%-15s' % m1, 
    for j in range(len(metrics)):
        m2 = metrics[j]
        print '%19.3f' % pearsonr(all_scores[m1], all_scores[m2])[0], 
    print

                              oasis        noncompliant             autopsy           sentiment
oasis                         1.000               0.041              -0.055               0.069
noncompliant                  0.041               1.000               0.343               0.072
autopsy                      -0.055               0.343               1.000               0.033
sentiment                     0.069               0.072               0.033               1.000


<h1>Trust-based Sentiment Disparities</h1>

In [58]:

from scipy.stats import mannwhitneyu

def significant_mean_diff_test(label, white, black):
    W = white.values()
    B = black.values()
    stat, pval = mannwhitneyu(W, B)

    n1 = len(W)
    n2 = len(B)
    
    mW = sorted(W)[n1/2]
    mB = sorted(B)[n2/2]
    
    #print '%-15s: p=%f' % (label,pval)
    print '%-15s: n1=%d median(p1)=%.3f n2=%d median(p2)=%.3f p=%.5f' % (label,n1,mW,n2,mB,pval)
    

score_names = {'race':race_scores,
               'oasis':oasis_scores,
               'noncompliant':noncompliant_scores,
               'autopsy':autopsy_scores}

num_black = sum([race_scores[hadm_id] for hadm_id in sa_ids])

for name,scores in score_names.items():
    sscores = sorted([(hadm_id,scores[hadm_id]) for hadm_id in sa_ids], key=lambda t:t[1])
    low_scores  = dict(sscores[:-num_black])
    high_scores = dict(sscores[ -num_black:])

    # Stratify by trust
    low  = {hadm_id:negsent_scores[hadm_id] for hadm_id in  low_scores.keys()}
    high = {hadm_id:negsent_scores[hadm_id] for hadm_id in high_scores.keys()}
    significant_mean_diff_test(name, low, high)


autopsy        : n1=9275 median(p1)=0.100 n2=1126 median(p2)=0.424 p=0.00000
oasis          : n1=9275 median(p1)=0.118 n2=1126 median(p2)=0.215 p=0.00001
race           : n1=9275 median(p1)=0.126 n2=1126 median(p2)=0.168 p=0.01134
noncompliant   : n1=9275 median(p1)=0.106 n2=1126 median(p2)=0.328 p=0.00000
