<h1>Outcomes Using Trust Features</h1>
Does performance on the downstream tasks (code status, leaving against medical advice (AMA), and in-hospital mortality) improve when mistrust features are added on top of demographics?
Yes.

In [1]:
import psycopg2
import pandas as pd
from time import gmtime, strftime
import tqdm

# Open a connection to the `mimic` PostgreSQL database through the Unix-socket
# directory passed as `host`. The cells below hand `con` to pandas for queries.
# NOTE(review): the database user is hard-coded; consider reading it from the
# environment so the notebook runs under other accounts.
con = psycopg2.connect(dbname ='mimic', user='wboag', host="/var/run/postgresql")
# Cursor on the same connection (the cells visible here query via `con` instead).
cur = con.cursor()

In [2]:
print strftime("%Y-%m-%d %H:%M:%S", gmtime())

# LABEL: code status

code_query = "select distinct hadm_id,label,value from mimiciii.chartevents c JOIN mimiciii.d_items i on i.itemid=c.itemid where label = 'Code Status'"
code_status = pd.read_sql_query(code_query, con)

# binary labels
code_labels = {}
for i,row in tqdm.tqdm(code_status.iterrows()):
    if row.value is not None:
        if ('DNR' in row.value) or ('DNI' in row.value) or ('Comfort' in row.value) or ('Do Not' in row.value):
            label = 'DNR/CMO'
        elif (row.value == 'Full Code') or (row.value == 'Full code'):
            label = 'Full Code'
    code_labels[row.hadm_id] = label
    
code_status.head()

2018-05-12 20:31:30


46121it [00:06, 7397.01it/s]


Unnamed: 0,hadm_id,label,value
0,100003,Code Status,Full code
1,100006,Code Status,Full Code
2,100007,Code Status,Full Code
3,100009,Code Status,Full code
4,100011,Code Status,Full code


In [3]:
# Show every distinct raw 'Code Status' value returned by the query, to check
# which strings the labelling rules have to cover.
print set(code_status['value'].values)

set(['DNR / DNI', 'DNR (do not resuscitate)', 'Other/Remarks', 'DNI (do not intubate)', 'Comfort measures only', 'Do Not Intubate', None, 'Full code', 'Full Code', 'Comfort Measures', 'CPR Not Indicate', 'Do Not Resuscita'])


In [4]:
# hadm -> race
import tqdm

def normalize_race(race):
    """Map a raw ethnicity string onto one of seven coarse race groups.

    The substrings are tried in the order listed and the first one found in
    `race` decides the group; strings matching none of them become 'Other'.
    """
    ordered_rules = (
        ('HISPANIC'       , 'Hispanic'),
        ('SOUTH AMERICAN' , 'Hispanic'),
        ('AMERICAN INDIAN', 'Native American'),
        ('ASIAN'          , 'Asian'),
        ('BLACK'          , 'Black'),
        ('WHITE'          , 'White'),
    )
    for needle, group in ordered_rules:
        if needle in race:
            return group
    return 'Other'

def normalize_insurance(insurance):
    """Collapse Medicare, Medicaid and Government into 'Public'.

    Any other insurance value is returned unchanged.
    """
    public_payers = ['Medicare', 'Medicaid', 'Government']
    return 'Public' if insurance in public_payers else insurance

In [5]:
# LABEL: left hospital against medical advice

# Discharge location of every admission.
discharge_query = 'SELECT distinct hadm_id,discharge_location FROM mimiciii.admissions'
discharge = pd.read_sql_query(discharge_query, con)

# binary labels: hadm_id -> 'AMA' | 'compliant'
ama_labels = {}
for idx, record in tqdm.tqdm(discharge.iterrows()):
    # The comparison string is the (truncated) value stored in the table.
    left_ama = (record.discharge_location == 'LEFT AGAINST MEDICAL ADVI')
    ama_labels[record.hadm_id] = 'AMA' if left_ama else 'compliant'

discharge.head()

58976it [00:05, 10909.70it/s]


Unnamed: 0,hadm_id,discharge_location
0,191772,DEAD/EXPIRED
1,174565,HOME
2,177287,HOME
3,110313,HOME HEALTH CARE
4,127542,HOME HEALTH CARE


In [6]:
# LABEL: in-hospital mortality

# In-hospital death flag of every admission.
mortality_query = 'SELECT distinct hadm_id,hospital_expire_flag FROM mimiciii.admissions'
mortality = pd.read_sql_query(mortality_query, con)

# binary labels: hadm_id -> 'deceased' | 'survived'
mortality_labels = {}
for idx, record in tqdm.tqdm(mortality.iterrows()):
    # A truthy flag means the patient died during the admission.
    mortality_labels[record.hadm_id] = 'deceased' if record.hospital_expire_flag else 'survived'

mortality.head()

58976it [00:04, 12035.90it/s]


Unnamed: 0,hadm_id,hospital_expire_flag
0,150909,0
1,111668,0
2,194641,0
3,193891,0
4,166563,0


In [7]:
import random

def data_split(ids, ratio=0.6, seed=None):
    """Randomly partition `ids` into a train part and a test part.

    Args:
        ids: any sequence or iterable of identifiers. It is copied before
            shuffling, so the caller's object is never reordered.
        ratio: fraction of the ids that goes into the train part; the cut
            point is int(len(ids) * ratio).
        seed: optional seed for a private random generator, giving a
            reproducible split. With the default None the module-level
            `random` state is used.

    Returns:
        (train, test): two lists that together hold every id exactly once.
    """
    shuffled = list(ids)
    if seed is None:
        random.shuffle(shuffled)
    else:
        # Private generator, so a seeded split does not disturb global state.
        random.Random(seed).shuffle(shuffled)

    cut = int(len(shuffled) * ratio)
    return shuffled[:cut], shuffled[cut:]

In [8]:
# write informative features code

def analyze(task, vect, clf, count_top=False):
    """Print every feature of a fitted linear model, largest weight first.

    Args:
        task: heading printed before the feature list.
        vect: fitted vectorizer; its `vocabulary_` maps feature name -> column.
        clf: fitted model exposing `coef_`.
        count_top: accepted but not used by this function.
    """
    # Invert the vocabulary: column index -> feature name.
    index_to_feature = dict((col, feat) for feat, col in vect.vocabulary_.items())

    weights = clf.coef_

    print(task)
    ranking = np.argsort(weights)

    # When coef_ is 2-D, only its first row is reported.
    if len(ranking.shape) == 2:
        ranking = ranking[0,:]
        weights = weights[0,:]

    # argsort is ascending, so walk it backwards for a descending listing.
    for col in ranking[::-1]:
        print('\t%-25s: %7.4f' % (index_to_feature[col], weights[col]))
        

In [9]:
%matplotlib inline

import numpy as np
import sklearn
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt
import pylab as plt


def compute_stats(task, pred, P, ref, labels_map, verbose):
    """Dispatch to the binary or multiclass metric routine.

    With exactly two labels, the two score columns of `P` are reduced to a
    single margin (column 1 minus column 0) and passed to
    compute_stats_binary. Otherwise `P` is forwarded unchanged to
    compute_stats_multiclass. Returns whatever the chosen routine returns.
    """
    if len(labels_map) != 2:
        return compute_stats_multiclass(task, pred, P, ref, labels_map, verbose)

    margin = P[:,1] - P[:,0]
    return compute_stats_binary(task, pred, margin, ref, labels_map, verbose)



def compute_stats_binary(task, pred, P, ref, labels, verbose):
    """Compute binary classification metrics from predictions and scores.

    Args:
        task: task name; kept for signature parity, not used here.
        pred: sequence of predicted class indices (0 or 1).
        P: array-like of decision scores, one per example. A score > 0 must
            correspond to a prediction of 1 (enforced by an assertion).
        ref: sequence of reference class indices (0 or 1).
        labels: label map; kept for signature parity, not used here.
        verbose: when true, print the confusion matrix and all metrics.

    Returns:
        dict with keys 'accuracy', 'precision', 'recall', 'f1', 'tpr', 'fpr',
        'auc', 'sensitivity' and 'specificity'. 'auc' is NaN when `ref` does
        not contain exactly two distinct classes, because ROC AUC is undefined
        in that case (previously this raised a NameError).

    Raises:
        AssertionError: if `pred` disagrees with the sign of `P`.
    """
    # Sanity check: the hard predictions must be the sign of the scores.
    assert np.array_equal((np.asarray(P) > 0).astype(int), np.asarray(pred)), \
        'predictions do not match the sign of the scores'

    # Confusion matrix indexed as conf[predicted, reference].
    conf = np.zeros((2,2), dtype='int32')
    for p,r in zip(pred,ref):
        conf[p, r] += 1

    if verbose:
        print(conf)
        print('')

    tp = conf[1,1]
    tn = conf[0,0]
    fp = conf[1,0]
    fn = conf[0,1]

    # The 1e-9 keeps every ratio defined when a denominator would be zero.
    precision   = tp / (tp + fp + 1e-9)
    recall      = tp / (tp + fn + 1e-9)
    sensitivity = tp / (tp + fn + 1e-9)
    specificity = tn / (tn + fp + 1e-9)

    f1 = (2*precision*recall) / (precision+recall+1e-9)

    # Rates taken from the confusion matrix instead of re-scanning pred/ref.
    tpr = tp / (tp + fn + 1e-9)
    fpr = fp / (fp + tn + 1e-9)

    accuracy = (tp+tn) / (tp+tn+fp+fn + 1e-9)

    if verbose:
        print('\tspecificity %.3f' % specificity)
        print('\tsensitivty: %.3f' % sensitivity)

    # AUC needs both classes in the reference labels; otherwise report NaN.
    auc = float('nan')
    if len(set(ref)) == 2:
        auc = sklearn.metrics.roc_auc_score(ref, P)
        if verbose: print('\t\tauc:        %.3f' % auc)

    if verbose:
        print('\taccuracy:   %.3f' % accuracy)
        print('\tprecision:  %.3f' % precision)
        print('\trecall:     %.3f' % recall)
        print('\tf1:         %.3f' % f1)
        print('\tTPR:        %.3f' % tpr)
        print('\tFPR:        %.3f' % fpr)

        print('TODO: VIZ THE ROC CURVE')

    res = {'accuracy':accuracy, 'precision':precision, 'recall':recall, 'f1':f1, 'tpr':tpr,
           'fpr':fpr, 'auc':auc, 'sensitivity':sensitivity, 'specificity':specificity}

    return res



def compute_stats_multiclass(task, pred, P, ref, labels_map, verbose=True):
    """Compute per-class precision, recall and F1 for a multiclass problem.

    Args:
        task: task name; kept for signature parity, not used here.
        pred: sequence of predicted class indices.
        P: 2-D array-like of per-class scores; its row-wise argmax must equal
            `pred` (enforced by an assertion).
        ref: sequence of reference class indices.
        labels_map: dict mapping label name -> class index.
        verbose: when true (the default), print the confusion matrix, the
            per-class table and the macro averages. compute_stats passes this
            as a sixth positional argument, which the previous five-parameter
            signature rejected with a TypeError.

    Returns:
        dict with keys 'precisions', 'recalls' and 'f1s'; each is a list with
        one value per class, ordered by class index.

    Raises:
        AssertionError: if `pred` disagrees with the argmax of `P`.
    """
    # Sanity check: hard predictions must be the argmax of the scores.
    assert np.array_equal(np.asarray(P).argmax(axis=1), np.asarray(pred)), \
        'predictions do not match the argmax of the scores'

    # Confusion matrix indexed as conf[predicted, reference].
    n = len(labels_map)
    conf = np.zeros((n,n), dtype='int32')
    for p,r in zip(pred,ref):
        conf[p, r] += 1

    # Label names ordered by their class index.
    labels = [label for label,i in sorted(labels_map.items(), key=lambda t:t[1])]

    if verbose:
        print(conf)
        print('')
        print('\t prec  rec    f1   label')

    precisions = []
    recalls = []
    f1s = []
    for i in range(n):
        label = labels[i]

        tp = conf[i,i]
        pred_pos = conf[i,:].sum()
        ref_pos  = conf[:,i].sum()

        # The 1e-9 keeps the ratios defined for classes never predicted/seen.
        precision   = tp / (pred_pos + 1e-9)
        recall      = tp / (ref_pos + 1e-9)
        f1 = (2*precision*recall) / (precision+recall+1e-9)

        if verbose:
            print('\t%.3f %.3f %.3f %s' % (precision,recall,f1,label))

        # Save info
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    if verbose and n:
        avg_precision = sum(precisions) / len(precisions)
        avg_recall    = sum(recalls   ) / len(recalls   )
        avg_f1        = sum(f1s       ) / len(f1s       )
        print('\t--------------------------')
        print('\t%.3f %.3f %.3f avg' % (avg_precision,avg_recall,avg_f1))

    if verbose:
        print('TODO: VIZ THE F1S')

    res = {'precisions':precisions, 'recalls':recalls, 'f1s':f1s}

    return res



def true_positive_rate(pred, ref):
    """Return the fraction of reference positives that were predicted positive.

    Only pairs whose reference is 1 and whose prediction is 0 or 1 count.
    The 1e-9 in the denominator yields 0.0 when there are no such pairs.
    """
    # Predictions made for the examples whose reference label is 1.
    on_positives = [p for p, r in zip(pred, ref) if r == 1]
    hits   = sum(1 for p in on_positives if p == 1)
    misses = sum(1 for p in on_positives if p == 0)
    return hits / (hits + misses + 1e-9)


def false_positive_rate(pred, ref):
    """Return the fraction of reference negatives that were predicted positive.

    Only pairs whose reference is 0 and whose prediction is 0 or 1 count.
    The 1e-9 in the denominator yields 0.0 when there are no such pairs.
    """
    # Predictions made for the examples whose reference label is 0.
    on_negatives = [p for p, r in zip(pred, ref) if r == 0]
    false_alarms = sum(1 for p in on_negatives if p == 1)
    rejections   = sum(1 for p in on_negatives if p == 0)
    return false_alarms / (false_alarms + rejections + 1e-9)




def classification_results(svm, labels_map, X, Y, task, verbose=True):
    """Score a fitted classifier on (X, Y) and return its metrics.

    Args:
        svm: fitted model exposing `decision_function`.
        labels_map: dict mapping label name -> class index; its size selects
            binary versus multiclass handling.
        X: feature matrix to score.
        Y: reference class indices for the rows of X.
        task: heading printed before the metrics when `verbose` is true.
        verbose: when true, print the heading and a trailing blank block.

    Returns:
        The metrics dict produced by compute_stats.
    """
    # Raw decision scores; these also feed the AUC computation.
    scores = svm.decision_function(X)

    # In the binary case decision_function yields one score per row. Expand it
    # to two columns (-score, +score) so the code below matches the 3+ class
    # layout of one column per class.
    if len(labels_map)==2:
        m = X.shape[0]
        P = np.zeros((m,2))
        P[:,0] = -scores
        P[:,1] =  scores
    else:
        P = scores

    # Hard predictions for whichever split was passed in (train or test).
    pred = P.argmax(axis=1)

    if verbose: print(task)
    res = compute_stats(task, pred, P, Y, labels_map, verbose)
    if verbose: print('\n')
    return res
    


def regression_results(lr, test_X, test_Y, description, verbose=True):
    """Evaluate a fitted regressor and optionally print and plot the result.

    Args:
        lr: fitted model exposing `predict`.
        test_X: features to predict on.
        test_Y: true target values.
        description: heading printed when `verbose` is true.
        verbose: when true, print both errors and draw a scatter plot of
            predictions against actual values.

    Returns:
        dict with 'rms' (root of the mean squared error) and 'mas' (mean
        absolute error).
    """
    pred_Y = lr.predict(test_X)
    res = {
        'rms': sqrt(mean_squared_error(test_Y, pred_Y)),
        'mas': mean_absolute_error(test_Y, pred_Y),
    }

    if not verbose:
        return res

    print(description)
    print('\tRMS: %s' % (res['rms'],))
    print('\tMAS: %s' % (res['mas'],))
    print('')

    plt.figure()
    # Red reference diagonal: where points would fall if prediction == actual.
    diagonal = np.arange(min(test_Y),max(test_Y),100)
    plt.scatter(diagonal, diagonal, color='red', s=0.01)
    plt.scatter(test_Y , pred_Y, color='blue', s=1)
    plt.xlabel('actual')
    plt.ylabel('prediction')
    plt.show()

    return res

In [28]:
# Load features

import cPickle as pickle

def normalize(scores):
    """Z-score the values of a dict, keeping its keys.

    Args:
        scores: dict mapping an identifier to a numeric score.

    Returns:
        A new dict with the same keys whose values are (value - mean) / std,
        using the population standard deviation of all values. An empty input
        gives an empty dict. When every value is identical (std == 0) all
        results are 0.0 instead of NaN/inf from a division by zero.
    """
    if not scores:
        return {}

    # list(...) so this also works where dict.values() returns a view.
    vals = np.array(list(scores.values()), dtype=float)
    mu = vals.mean()
    std = vals.std()

    if std == 0:
        return { k:0.0 for k in scores }

    return { k:(v-mu)/std for k,v in scores.items()}


# Build `demographics`: one frame holding insurance, OASIS, the three
# normalised mistrust scores and patient demographics, joined on hadm_id.

# query for insurance info
insurance_query = 'SELECT distinct hadm_id,insurance FROM mimiciii.admissions'
insurance = pd.read_sql_query(insurance_query, con)

# query for oasis info
oasis_query = 'SELECT distinct hadm_id,oasis FROM mimiciii.oasis'
oasis = pd.read_sql_query(oasis_query, con)

# query for demographics info (newborn admissions are dropped right after)
patients_query = 'SELECT distinct hadm_id,gender,age,ethnicity,admission_type,los_hospital FROM mimiciii.icustay_detail'
patients = pd.read_sql_query(patients_query, con)
patients = patients.loc[patients['admission_type']!='NEWBORN']

# Each pickle is loaded as a dict (hadm_id -> score), z-scored by normalize(),
# and turned into a two-column frame for merging.
# NOTE(review): pickle.load can execute arbitrary code; only load files from
# a trusted source.

# Load trust scores
with open('../data/mistrust_noncompliant.pkl', 'rb') as f:
    noncompliant_dict = normalize(pickle.load(f))
print 'noncompliant:', len(noncompliant_dict)
noncompliant_df = pd.DataFrame(noncompliant_dict.items(), columns=['hadm_id','noncompliant'])

# Load trust scores
with open('../data/mistrust_autopsy.pkl', 'rb') as f:
    autopsy_dict = normalize(pickle.load(f))
print 'autopsy:', len(autopsy_dict)
autopsy_df = pd.DataFrame(autopsy_dict.items(), columns=['hadm_id','autopsy'])

# Load trust scores
with open('../data/neg_sentiment.pkl', 'rb') as f:
    sentiment_dict = normalize(pickle.load(f))
print 'sentiment:', len(sentiment_dict)
sentiment_df = pd.DataFrame(sentiment_dict.items(), columns=['hadm_id','sentiment'])

    
# merge data
# pd.merge is called without `how`, so each step uses its default join and an
# admission must appear in every source to survive the chain.
# NOTE(review): the sample output shows the same hadm_id on several rows with
# different oasis values; confirm whether one row per admission is intended.
extra_1 = pd.merge(insurance, oasis, on=['hadm_id'])
extra_2 = pd.merge(extra_1, noncompliant_df, on=['hadm_id'])
extra_3 = pd.merge(extra_2, autopsy_df     , on=['hadm_id'])
extra_4 = pd.merge(extra_3, sentiment_df   , on=['hadm_id'])
demographics = pd.merge(extra_4, patients  , on=['hadm_id'])

# Normalize some columns
# Coarsen the categorical values, then rename to the short column names that
# later cells read (`race`, `los`).
demographics['ethnicity'] = demographics['ethnicity'].apply(normalize_race)
demographics['insurance'] = demographics['insurance'].apply(normalize_insurance)
demographics = demographics.rename(columns={'ethnicity':'race'})
demographics = demographics.rename(columns={'los_hospital':'los'})

demographics.head()

noncompliant: 54510
autopsy: 54510
sentiment: 48273
extra1: 61382
extra2: 58009
extra3: 58009
extra4: 51522


Unnamed: 0,hadm_id,insurance,oasis,noncompliant,autopsy,sentiment,gender,age,race,admission_type,los
0,191826,Public,22,-0.659708,-0.430902,0.267377,M,80.6794,White,ELECTIVE,10.2431
1,127133,Private,29,1.696974,-1.410404,-0.336874,M,63.4036,White,EMERGENCY,29.9319
2,127133,Private,47,1.696974,-1.410404,-0.336874,M,63.4036,White,EMERGENCY,29.9319
3,110408,Public,32,-0.806254,-0.424124,-0.450595,F,64.0647,Other,EMERGENCY,15.6868
4,191517,Private,23,0.235403,0.555377,1.184329,M,51.8053,White,EMERGENCY,3.1125


In [29]:
import numpy as np
from sklearn.feature_extraction import DictVectorizer

print strftime("%Y-%m-%d %H:%M:%S")


def normalize_mean_std(value, mu, std):
    """Return `value` shifted by `mu` and scaled by `std` (a z-score)."""
    centered = value - mu
    return centered / std
    
# Z-score the three continuous columns of `demographics` in place, using each
# column's own mean and population standard deviation.

# age
ages = np.array(demographics['age'])
age_mu, age_std = ages.mean(), ages.std()
demographics['age'] = demographics['age'].apply(
    lambda age: normalize_mean_std(age, age_mu, age_std))

# oasis severity score (note: `oasis` now names this array of scores)
oasis = np.array(demographics['oasis'])
oasis_mu, oasis_std = oasis.mean(), oasis.std()
demographics['oasis'] = demographics['oasis'].apply(
    lambda score: normalize_mean_std(score, oasis_mu, oasis_std))

# hospital length of stay
los = np.array(demographics['los'])
los_mu, los_std = los.mean(), los.std()
demographics['los'] = demographics['los'].apply(
    lambda stay: normalize_mean_std(stay, los_mu, los_std))

# foo

def build_features(enabled):
    demographics_features = {}
    for i,row in tqdm.tqdm(demographics.iterrows()):
        feats = {}

        if 'admission_type' in enabled: feats[('admission_type', row.admission_type   )] = 1
        if 'oasis'          in enabled: feats[('oasis', None)] = row.oasis

        if 'age' in enabled: feats[('age'  , None)] = row.age
        if 'los' in enabled: feats[('los'  , None)] = row.los

        if 'insurance' in enabled: feats[('insurance'     , row.insurance)] = 1
        if 'gender'    in enabled: feats[('gender'        , row.gender   )] = 1

        if 'race'     in enabled: feats[('race', row.race     )] = 1
            
        if 'noncompliant' in enabled: feats[('concompliant',None)] = row.noncompliant
        if 'autopsy'      in enabled: feats[('autopsy'     ,None)] = row.autopsy
        if 'sentiment'    in enabled: feats[('sentiment'   ,None)] = row.sentiment

        demographics_features[row.hadm_id] = feats

    print strftime("%Y-%m-%d %H:%M:%S")

    # fit vectorizer
    vect = DictVectorizer()
    vect.fit(demographics_features.values())
    print 'num_features:', len(vect.get_feature_names())

    # ordering of all features
    ids = demographics_features.keys()
    print '\t', strftime("%Y-%m-%d %H:%M:%S")
    X = vect.transform([demographics_features[hadm_id] for hadm_id in ids])    

    return demographics_features, vect
    
print strftime("%Y-%m-%d %H:%M:%S")

2018-05-12 16:58:26
2018-05-12 16:58:26


In [43]:
# AMA
from collections import defaultdict, Counter

print strftime("%Y-%m-%d %H:%M:%S")
from sklearn.linear_model import LogisticRegression



featlists = {
                #'BASELINE'             :['age', 'los', 'insurance', 'gender'],
                #'BASELINE+RACE'        :['age', 'los', 'insurance', 'gender', 'race'],
                #'BASELINE+NONCOMPLIANT':['age', 'los', 'insurance', 'gender', 'noncompliant'],
                #'BASELINE+AUTOPSY'     :['age', 'los', 'insurance', 'gender', 'autopsy'],
                #'BASELINE+SENTIMENT'   :['age', 'los', 'insurance', 'gender', 'sentiment'],
                'BASELINE+ALL'         :['age', 'los', 'insurance', 'gender', 'race', 'noncompliant', 'autopsy', 'sentiment']
            }

ama_Y_vect = {'AMA': 1, 'compliant': 0}

feature_weights = defaultdict(list)

for name,featlist in featlists.items():
    print name
    print featlist
    
    demographics_features, vect = build_features(featlist)
    ind2feat =  { i:f for f,i in vect.vocabulary_.items() }

    ama_ids = list(set(discharge['hadm_id'].values) & set(demographics_features.keys()))
    print 'patients:', len(ama_ids)
  
    print Counter([ama_Y_vect[ama_labels[hadm_id]] for hadm_id in ama_ids])

    aucs = []
    for iteration in tqdm.tqdm(range(100)):

        # train/test split
        ama_train_ids, ama_test_ids = data_split(ama_ids)

        # select pre-computed features
        ama_train_features = [demographics_features[hadm_id] for hadm_id in ama_train_ids]
        ama_test_features  = [demographics_features[hadm_id] for hadm_id in ama_test_ids ]

        # vectorize features
        ama_train_X = vect.transform(ama_train_features)
        ama_test_X  = vect.transform(ama_test_features)

        # vectorize task-specific labels
        #print ama_Y_vect

        # select labels
        ama_train_Y = [ama_Y_vect[ama_labels[hadm_id]] for hadm_id in ama_train_ids]
        ama_test_Y  = [ama_Y_vect[ama_labels[hadm_id]] for hadm_id in ama_test_ids ]

        # fit model
        ama_svm = LogisticRegression(C=0.1, penalty='l1', tol=0.01)
        ama_svm.fit(ama_train_X,ama_train_Y)
        #print ama_svm


        # AMA Model eval

        # evaluate model
        res = classification_results(ama_svm, ama_Y_vect,  ama_test_X,  ama_test_Y, 'test:  ama', verbose=False)
        aucs.append(res['auc'])

        # record the weights of the features (because we average them)
        if name == 'BASELINE+ALL':
            for feat,val in enumerate(ama_svm.coef_.tolist()[0]):
                featname = ind2feat[feat]
                feature_weights[featname].append(val)

        #classification_results(ama_svm, ama_Y_vect, ama_train_X, ama_train_Y, 'train: ama')

    aucs = np.array(aucs)
    print 'AUCS: ', aucs
    print '    mean:     ', aucs.mean()
    print '    1.96*std: ', aucs.std() * 1.96
    print '    conf_interval: (%.4f,%.4f)' % (aucs.mean()-1.96*aucs.std(),aucs.mean()+1.96*aucs.std())


    # most informative features
    analyze('ama', vect, ama_svm)
    print '\n\n\n'

    if name == 'BASELINE+ALL':
        for featname,vals in sorted(feature_weights.items()):
            v = np.array(vals)
            mu = v.mean()
            std = v.std()
            print '%-10s:%-15s || %.2f +/- %.2f' % (featname[0],featname[1],mu,1.96*std)

print strftime("%Y-%m-%d %H:%M:%S")

20it [00:00, 199.51it/s]

2018-05-12 18:13:31
BASELINE+ALL
['age', 'los', 'insurance', 'gender', 'race', 'noncompliant', 'autopsy', 'sentiment']


51522it [00:08, 6189.57it/s]


2018-05-12 18:13:40
num_features: 16
	2018-05-12 18:13:40


  0%|          | 0/100 [00:00<?, ?it/s]

patients: 48071
Counter({0: 47746, 1: 325})


100%|██████████| 100/100 [01:01<00:00,  1.62it/s]

AUCS:  [0.86304508 0.8898861  0.88454724 0.87209732 0.8691665  0.88331788
 0.89105331 0.84446035 0.87904439 0.84483964 0.86385253 0.87041656
 0.88775113 0.8785297  0.89844164 0.88767295 0.87012795 0.87562579
 0.88172111 0.87176786 0.87337841 0.86876047 0.87283639 0.83836708
 0.86955035 0.86759934 0.87989298 0.89410665 0.85591779 0.86279355
 0.88124919 0.89341509 0.87808205 0.86907466 0.87280057 0.85670457
 0.87209048 0.86640638 0.86984129 0.89132035 0.8632953  0.89894989
 0.87027522 0.87446934 0.89029219 0.86157223 0.87621635 0.88085429
 0.89322966 0.87153019 0.87043719 0.87260386 0.86098986 0.8868004
 0.87057264 0.88764782 0.8840213  0.86125103 0.8625529  0.87929983
 0.8500925  0.86213968 0.87824507 0.88079966 0.88007555 0.88075165
 0.86605238 0.87218972 0.90277366 0.87567829 0.87541977 0.88838886
 0.83465103 0.87513934 0.83261367 0.88089711 0.83799851 0.88093045
 0.85280004 0.86700751 0.86270631 0.87246667 0.86793567 0.86684598
 0.8430358  0.86769759 0.85548898 0.8420679  0.88252209 




In [41]:
# Code Status

print strftime("%Y-%m-%d %H:%M:%S")


from sklearn.linear_model import LogisticRegression



featlists = {
                #'BASELINE'             :['age', 'los', 'insurance', 'gender'],
                #'BASELINE+RACE'        :['age', 'los', 'insurance', 'gender', 'race'],
                #'BASELINE+NONCOMPLIANT':['age', 'los', 'insurance', 'gender', 'noncompliant'],
                #'BASELINE+AUTOPSY'     :['age', 'los', 'insurance', 'gender', 'autopsy'],
                #'BASELINE+SENTIMENT'   :['age', 'los', 'insurance', 'gender', 'sentiment'],
                'BASELINE+ALL'         :['age', 'los', 'insurance', 'gender', 'race', 'noncompliant', 'autopsy', 'sentiment']
            }

cs_Y_vect = {'DNR/CMO': 1, 'Full Code': 0}

feature_weights = defaultdict(list)

for name,featlist in featlists.items():
    print name
    print featlist
    
    demographics_features, vect = build_features(featlist)
    ind2feat =  { i:f for f,i in vect.vocabulary_.items() }
    
    print ind2feat

    cs_ids = list(set(code_labels.keys()) & set(demographics_features.keys()))
    print 'patients:', len(cs_ids)
    
    print Counter([cs_Y_vect[code_labels[hadm_id]] for hadm_id in cs_ids])

    
    aucs = []
    for iteration in tqdm.tqdm(range(100)):

        #print 'Iter:', iteration

        # train/test split
        cs_train_ids, cs_test_ids = data_split(cs_ids)

        # select pre-computed features
        cs_train_features = [demographics_features[hadm_id] for hadm_id in cs_train_ids]
        cs_test_features  = [demographics_features[hadm_id] for hadm_id in cs_test_ids ]

        # vectorize features
        cs_train_X = vect.transform(cs_train_features)
        cs_test_X  = vect.transform(cs_test_features)

        # vectorize task-specific labels
        cs_Y_vect = {'DNR/CMO': 1, 'Full Code': 0}
        #print cs_Y_vect

        # select labels
        cs_train_Y = [cs_Y_vect[code_labels[hadm_id]] for hadm_id in cs_train_ids]
        cs_test_Y  = [cs_Y_vect[code_labels[hadm_id]] for hadm_id in cs_test_ids ]

        # fit model
        cs_svm = LogisticRegression(C=0.1, penalty='l1', tol=0.01)
        cs_svm.fit(cs_train_X,cs_train_Y)
        #print cs_svm


        # cs Model eval

        # evaluate model
        res = classification_results(cs_svm, cs_Y_vect, cs_test_X,  cs_test_Y, 'test:  cs', verbose=False)
        aucs.append(res['auc'])

        # record the weights of the features (because we average them)
        if name == 'BASELINE+ALL':
            for feat,val in enumerate(cs_svm.coef_.tolist()[0]):
                featname = ind2feat[feat]
                feature_weights[featname].append(val)
            
        # most informative features
        #analyze('cs', vect, cs_svm)
        
    aucs = np.array(aucs)
    print 'AUCS: ', aucs
    print '    mean:     ', aucs.mean()
    print '    1.96*std: ', aucs.std() * 1.96
    print '    conf_interval: (%.4f,%.4f)' % (aucs.mean()-1.96*aucs.std(),aucs.mean()+1.96*aucs.std())


    # most informative features
    analyze('cs', vect, cs_svm)

    if name == 'BASELINE+ALL':
        for featname,vals in sorted(feature_weights.items()):
            v = np.array(vals)
            mu = v.mean()
            std = v.std()
            print '%-10s:%-15s || %.2f +/- %.2f' % (featname[0],featname[1],mu,1.96*std)
    
print strftime("%Y-%m-%d %H:%M:%S")

514it [00:00, 5130.24it/s]

2018-05-12 18:11:24
BASELINE+ALL
['age', 'los', 'insurance', 'gender', 'race', 'noncompliant', 'autopsy', 'sentiment']


51522it [00:07, 6572.52it/s]


2018-05-12 18:11:31
num_features: 16
	2018-05-12 18:11:32


  0%|          | 0/100 [00:00<?, ?it/s]

{0: ('age', None), 1: ('autopsy', None), 2: ('concompliant', None), 3: ('gender', 'F'), 4: ('gender', 'M'), 5: ('insurance', 'Private'), 6: ('insurance', 'Public'), 7: ('insurance', 'Self Pay'), 8: ('los', None), 9: ('race', 'Asian'), 10: ('race', 'Black'), 11: ('race', 'Hispanic'), 12: ('race', 'Native American'), 13: ('race', 'Other'), 14: ('race', 'White'), 15: ('sentiment', None)}
patients: 39815
Counter({0: 37359, 1: 2456})


100%|██████████| 100/100 [00:53<00:00,  1.87it/s]

AUCS:  [0.78324652 0.77849969 0.77258877 0.78107285 0.77630158 0.78631457
 0.78667362 0.78527314 0.77617271 0.77064476 0.78380085 0.77310122
 0.78967162 0.77809387 0.78098194 0.79000061 0.77628243 0.78179626
 0.77897043 0.77930792 0.77893593 0.78719704 0.79322074 0.77774729
 0.78029961 0.78037309 0.7723545  0.78490407 0.78631093 0.78732274
 0.78235319 0.77329771 0.77591114 0.77956341 0.78691246 0.77675309
 0.776622   0.78522874 0.78342732 0.78534675 0.76544258 0.78088814
 0.79167358 0.79028152 0.78033415 0.7828873  0.78276036 0.77231833
 0.77822337 0.7872044  0.79584637 0.78630402 0.7823201  0.78161895
 0.78210182 0.78387315 0.78035303 0.77693518 0.79571572 0.78778787
 0.78069838 0.77774305 0.78566457 0.77782712 0.78919824 0.78370198
 0.77462228 0.78476863 0.7792839  0.78306203 0.79569874 0.76825369
 0.78912419 0.78038647 0.78903377 0.77597883 0.78139146 0.77674471
 0.78620727 0.78381803 0.77374165 0.78071827 0.79129715 0.78083564
 0.78965661 0.78486894 0.78861609 0.78493896 0.79438435




In [44]:
# Mortality

print strftime("%Y-%m-%d %H:%M:%S")


from sklearn.linear_model import LogisticRegression


featlists = {
                #'BASELINE'             :['age', 'los', 'insurance', 'gender'],
                #'BASELINE+RACE'        :['age', 'los', 'insurance', 'gender', 'race'],
                #'BASELINE+NONCOMPLIANT':['age', 'los', 'insurance', 'gender', 'noncompliant'],
                #'BASELINE+AUTOPSY'     :['age', 'los', 'insurance', 'gender', 'autopsy'],
                #'BASELINE+SENTIMENT'   :['age', 'los', 'insurance', 'gender', 'sentiment'],
                'BASELINE+ALL'         :['age', 'los', 'insurance', 'gender', 'race', 'noncompliant', 'autopsy', 'sentiment']
            }
mortality_Y_vect = {'deceased': 1, 'survived': 0}

feature_weights = defaultdict(list)


for name,featlist in featlists.items():
    print name
    print featlist
    
    demographics_features, vect = build_features(featlist)
    ind2feat =  { i:f for f,i in vect.vocabulary_.items() }

    mortality_ids = list(set(mortality_labels.keys()) & set(demographics_features.keys()))
    print 'patients:', len(mortality_ids)
    
    print Counter([mortality_Y_vect[mortality_labels[hadm_id]] for hadm_id in mortality_ids])

    aucs = []
    for iteration in tqdm.tqdm(range(100)):

        #print 'Iter:', iteration


        # train/test split
        mortality_train_ids, mortality_test_ids = data_split(mortality_ids)

        # select pre-computed features
        mortality_train_features = [demographics_features[hadm_id] for hadm_id in mortality_train_ids]
        mortality_test_features  = [demographics_features[hadm_id] for hadm_id in mortality_test_ids ]

        # vectorize features
        mortality_train_X = vect.transform(mortality_train_features)
        mortality_test_X  = vect.transform(mortality_test_features)

        # vectorize task-specific labels
        #print mortality_Y_vect

        # select labels
        mortality_train_Y = [mortality_Y_vect[mortality_labels[hadm_id]] for hadm_id in mortality_train_ids]
        mortality_test_Y  = [mortality_Y_vect[mortality_labels[hadm_id]] for hadm_id in mortality_test_ids ]

        # fit model
        mortality_svm = LogisticRegression(C=0.1, penalty='l1', tol=0.01)
        mortality_svm.fit(mortality_train_X,mortality_train_Y)
        #print mortality_svm


        # mortality Model eval

        # evaluate model
        res = classification_results(mortality_svm, mortality_Y_vect, mortality_test_X, mortality_test_Y, 'test:  mortality', verbose=False)
        aucs.append(res['auc'])

        # record the weights of the features (because we average them)
        if name == 'BASELINE+ALL':
            for feat,val in enumerate(mortality_svm.coef_.tolist()[0]):
                featname = ind2feat[feat]
                feature_weights[featname].append(val)
            
        # most informative features
        #analyze('mortality', vect, mortality_svm)
        
    aucs = np.array(aucs)
    print 'AUCS: ', aucs
    print '    mean:     ', aucs.mean()
    print '    1.96*std: ', aucs.std() * 1.96
    print '    conf_interval: (%.4f,%.4f)' % (aucs.mean()-1.96*aucs.std(),aucs.mean()+1.96*aucs.std())


    # most informative features
    analyze('mortality', vect, mortality_svm)

    # foo


    if name == 'BASELINE+ALL':
        for featname,vals in sorted(feature_weights.items()):
            v = np.array(vals)
            mu = v.mean()
            std = v.std()
            print '%-10s:%-15s || %.2f +/- %.2f' % (featname[0],featname[1],mu,1.96*std)
            
    print '\n\n'
    
print strftime("%Y-%m-%d %H:%M:%S")

0it [00:00, ?it/s]

2018-05-12 18:14:42
BASELINE+ALL
['age', 'los', 'insurance', 'gender', 'race', 'noncompliant', 'autopsy', 'sentiment']


51522it [00:07, 6450.03it/s]


2018-05-12 18:14:50
num_features: 16
	2018-05-12 18:14:50


  0%|          | 0/100 [00:00<?, ?it/s]

patients: 48071
Counter({0: 42961, 1: 5110})


100%|██████████| 100/100 [00:54<00:00,  1.82it/s]

AUCS:  [0.62663672 0.62554044 0.63556882 0.62835858 0.63007437 0.63280478
 0.64252526 0.62673491 0.62470338 0.62104103 0.63962794 0.63507344
 0.63179274 0.63274282 0.6245192  0.63277841 0.63802319 0.63263878
 0.63866185 0.63158812 0.64011622 0.64346836 0.62632134 0.63621778
 0.6311151  0.63555007 0.63217218 0.63435357 0.63448743 0.63917573
 0.63259436 0.6350698  0.63995601 0.63043239 0.63344198 0.63279208
 0.6340524  0.62895951 0.63989063 0.62852    0.63437974 0.62799249
 0.63266594 0.63489441 0.63226543 0.63160259 0.63506163 0.62727209
 0.62709061 0.64039158 0.6388388  0.6375865  0.63205127 0.63197106
 0.63126289 0.63305162 0.63643081 0.63880446 0.63810585 0.63442502
 0.64293466 0.63015363 0.6332782  0.63691409 0.63789679 0.63672263
 0.63472536 0.63507172 0.63622671 0.62634621 0.63057716 0.63296238
 0.64309214 0.6351612  0.63148087 0.63210499 0.63224196 0.63462201
 0.63555542 0.62997502 0.64211603 0.62775672 0.63095006 0.62733191
 0.6268335  0.63452361 0.62966049 0.63259643 0.63008711




In [55]:

metrics = {'noncompliant':noncompliant_dict, 'autopsy':autopsy_dict, 'sentiment':sentiment_dict}

for metric,scores in metrics.items():
    print metric

    vals = sorted(scores.values())
    n = len(vals)
    t1 = vals[1*n/4]
    t2 = vals[2*n/4]
    t3 = vals[3*n/4]

    lowest  = [hadm_id for hadm_id,score in scores.items() if     score<=t1]
    highest = [hadm_id for hadm_id,score in scores.items() if t3< score    ]

    def mort_rate(label, hadm_ids):
        cohort = mortality.loc[mortality['hadm_id'].isin(hadm_ids)]
        print '\t', label, sum(cohort['hospital_expire_flag'].values)/float(len(cohort))

    mort_rate('most  trust', lowest)
    mort_rate('least trust', highest)

autopsy
	most  trust 0.12787967718268525
	least trust 0.08814726129901228
noncompliant
	most  trust 0.043835616438356165
	least trust 0.13696418085731063
sentiment
	most  trust 0.07846549009859972
	least trust 0.14477730948855222
