In [5]:
#Generates 3D numpy array from mimic iii data.  One hot encoded.  Shards into num_shards numpy arrays.

import pandas as pd
import math
import numpy as np
import sys
from datetime import datetime

class Feature(object):
    def __init__(self, path="~/mimic3/data_full/"):
        self.path = path
        
    def get_admission(self):
        admission_df = pd.read_csv(self.path+'ADMISSIONS.csv', usecols=['SUBJECT_ID','HADM_ID','ADMITTIME'])
        
        #Sort by HADM_ID
        admission_df = admission_df.set_index('HADM_ID', drop = False)
        admission_df.sort_index(inplace=True)
        
        #Take first eigth of admissions (Memory constraint)
#         admission_df = admission_df.iloc[0:round(len(admission_df)/8)]
        return admission_df
    
    #loads the chartevents file
    
    def get_chartevents(self, items, adm):
        c_df = pd.read_csv(self.path+'CHARTEVENTS.csv', usecols=['HADM_ID','ITEMID','CHARTTIME','VALUENUM'])
        #takes rows for patients that appear in admissions,
        c_df = c_df[c_df['HADM_ID'].isin(adm['HADM_ID'])]
        #take measurement values that are not null.
        c_df = c_df[c_df['VALUENUM'].notnull()]
        #take rows with desired item IDs
        c_df = c_df[c_df.ITEMID.isin([x for v in items.values() for x in v])]
        #Normalize each measurent values to mean zero and std dev 1 accross each item ID
        for item_n in items.keys():
            for itid in items[item_n]:
                c_df.loc[c_df['ITEMID'] == itid,'VALUENUM'] = (c_df.loc[c_df['ITEMID']==itid, 'VALUENUM'] 
                                                           - c_df.loc[c_df['ITEMID']==itid, 'VALUENUM'].mean())/c_df.loc[c_df['ITEMID'] == itid,'VALUENUM'].std()
        return c_df 
    
    #Defines the item IDs that will be used in the dataset
    def get_items(self):
        HR = [220045, 211] #['Heart Rate', 'Heart Rate']
        #BPm = [220181 , 52, 456, 220052] #['Non Invasive Blood Pressure mean', 'Arterial BP Mean','NBP Mean', 'Arterial Blood Pressure mean']
        BP_S = [51]#220179, , 220050]  #['Non Invasive Blood Pressure systolic', 'Arterial BP [Systolic]', 455, 'NBP [Systolic]', 'Arterial Blood Pressure systolic']
        BP_D = [8368]#220180, , 220051] #, ['Non Invasive Blood Pressure diastolic', 'Arterial BP [Diastolic]', 8441'NBP [Diastolic]', 'Arterial Blood Pressure diastolic'] 
        CAP_Refill = [] 
        Glucose = [807]#, 225664, 811, 227015] # ['Fingerstick Glucose', 'Glucose finger stick','Glucose (70-105)'] 
        pH = [227037] 
        Temp = [223761]#, 678, 679, 227054]  #['Temperature Fahrenheit', 'Temperature F', 'Temperature F (calc)']
        UrineScore = [227059] 
        O2_Sat = [220277]#, 646, 227035] #['O2 saturation pulseoxymetry', 'SpO2'] 
        RespiratoryRate = [220210]#, 618, 615] #['Respiratory Rate', 'Respiratory Rate','Resp Rate (Total)']
        GCS = [198] #['GCS Total']
        FiO2 = [223835]#, 1040] #['Inspired O2 Fraction (%)']
        ETCO2 = [1817]
        return {
            'HR': HR,
            'BP_S': BP_S,
            'BP_D': BP_D,
            'CAP_Refill': CAP_Refill,
            'Glucose': Glucose,
            'pH': pH,
            'Temp': Temp,
            'UrineScore': UrineScore,
            'O2_Sat': O2_Sat,
            'RespiratoryRate': RespiratoryRate,
            'GCS': GCS,
            'FiO2': FiO2,
            'ETCO2': ETCO2
        }
#     def get_it_mean_std(self,items, ce):
#         valms = { item_n : ce[ce.ITEMID.isin(items[item_n])].VALUENUM.mean() for item_n in sorted(items.keys())}
#         valst = { item_n : ce[ce.ITEMID.isin(items[item_n])].VALUENUM.std() for item_n in sorted(items.keys())}
#         return (valms, valst)
        
    def get_relative_time(self, admit_time, chart_time):
        """
        admit_time, chart_time str
        :return value in hours
        """
        
        # Accepts strings in "2144-07-24 09:00:00" format and returns difference in seconds
        adm_time = datetime.strptime(admit_time, '%Y-%m-%d %H:%M:%S')
        char_time = datetime.strptime(chart_time, '%Y-%m-%d %H:%M:%S')
        del_time = char_time - adm_time
        rel_time = del_time.days*86400 + del_time.seconds
        return rel_time/3600
    

    def get_relative_time_per_admid2(self, df, admit_time):
        """
        :param df: chartevent df for particular HADM_ID
        :param admit_time: str
        :return tuple (new df with extra column, number of measurements)
        """
        rlt = []
        numMeas = 0
        for cTime in df['CHARTTIME']:
            temp = self.get_relative_time(admit_time, cTime)
            #set chart times that are before admittime to 0.
            if temp > 0:
                rlt.append(temp)
            else:
                rlt.append(0)
            numMeas += 1    

        df['RELTIME'] = rlt
        return (df, numMeas)
    
    def get_max_time_per_admid(self, df, admit_time):
        """
        :param df: chartevent df for particular HADM_ID
        :param admit_time: str
        :return tuple (new df with extra column, max time patient was monitored)
        """
        rlt = []
        max_time = 0
        for cTime in df['CHARTTIME']:
            temp = self.get_relative_time(admit_time, cTime)
            rlt.append(temp)
            if (temp > max_time):
                max_time = temp
        
        
        return max_time


    def get_max_time(self, chartevents_df, admission_df):
        max_t = 0
        for idx, admission in admission_df.iterrows():
            max_time = self.get_max_time_per_admid(
                chartevents_df[chartevents_df['HADM_ID'] == admission['HADM_ID']],
                admission['ADMITTIME'])# max_time is in hours
            if(max_time > max_t):
                max_t = max_time
        return int(max_t)
        
    def printl(self, string):
        print(string)
        sys.stdout.flush()
        
    def OH_F(self, ITID, itemsd):
        oh_v = np.zeros(len(itemsd))
        iidx = 0
        for item_name in sorted(itemsd.keys()):
            if ITID in itemsd[item_name]:
                oh_v[iidx] = 1
                th_item_name = item_name
            iidx += 1
        return (oh_v, th_item_name)
    
    def get_OH(self, admission_df,chartevents_df,items):
        # Generates and saves a numpy array consisting of measurements for all patients in admission_df
        
        #(valms, valst) = self.get_it_mean_std(items, chartevents_df)
        
        #Get rows for patients in admission_df
        chartevents_df = chartevents_df[chartevents_df['HADM_ID'].isin(admission_df['HADM_ID'])]
        
        self.printl('getting dimensions')
        #Find dimension of array (numPat x num_meas x feature length)
        numPat = len(admission_df.groupby('HADM_ID').size())
        maxNumMeas = chartevents_df.groupby('HADM_ID').size().max()
        feaLen = len(items)+2
        
        self.printl('making array')
        #Generate array of zeros that will be filled below
        features = np.zeros(shape=(numPat, maxNumMeas, feaLen))
        z = np.zeros(shape=(numPat))
        
        pat_idx = 0
        
        #For each patient in admission_df save the measurements to array
        for sbj_id, admission in admission_df.iterrows():
            
            cnt = 0 #indexes measurements
            
            self.printl(str(admission['HADM_ID']) + ' started')
            
            #Get (dataframe for a specific patient with Reltime column, number of measurements)
            (chartevents_perAdm_df, numMeas) = self.get_relative_time_per_admid2(
                    chartevents_df[chartevents_df['HADM_ID'] == admission['HADM_ID']],
                    admission['ADMITTIME'])
            
            #Save the sequence length to z
            z[pat_idx] = numMeas
            
#             #Generate 2D array for measurements of this patient.
#             feature_patient = np.zeros(shape=(features.shape[1], features.shape[2]))
            
            #Sort by time
            chartevents_perAdm_df = chartevents_perAdm_df.set_index('RELTIME', drop = False)
            chartevents_perAdm_df.sort_index(inplace=True)
            
            #for each row in chartevents create a row in 3D array 
            for c_idx, row in chartevents_perAdm_df.iterrows():
                ITID = row['ITEMID']
                #Generate one hot encoding for feature type.
                (oh_v, it_n) = self.OH_F(ITID, items)
                #assign corresponding row in array to the one hot vector, the time, and the value of measurement.
                features[pat_idx, cnt] = np.append(oh_v, [row['RELTIME'], row['VALUENUM']])
                cnt += 1
                
            pat_idx +=1
        return features, z
    
    def get_shards(self, num_shards):
        #Creates num_shards numpy arrays
        items = self.get_items()
        
        self.printl('getting admissions')
        adm_df = self.get_admission()
        
        self.printl('getting chart events')
        c_df = self.get_chartevents(items,adm_df)
        
        #set starting indices
        s_len = int(len(adm_df)/num_shards)
        st_idx = 0
          
            
        for i in np.arange(num_shards):
            #generate arrays for indices
            x,z = self.get_OH(adm_df[st_idx:st_idx+s_len],c_df,items)
            #save arrays
            np.save("xC"+str(i),x)
            np.save("zC"+str(i),z) 
            #roll arrays forward
            st_idx += s_len
#             np.roll(adm_df,s_len)
        
    

In [6]:
feature = Feature()
feature.get_shards(100)

print("done")

getting admissions
getting chart events
getting dimensions
making array
100001 started
100003 started
100006 started
100007 started


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


100009 started
100010 started
100011 started
100012 started
100014 started
100016 started
100017 started
100018 started
100019 started
100020 started
100021 started
100023 started
100024 started
100025 started
100028 started
100029 started
100030 started
100031 started
100033 started
100034 started
100035 started
100036 started
100037 started
100038 started
100039 started
100040 started
100041 started
100044 started
100045 started
100046 started
100047 started
100050 started
100052 started
100053 started
100055 started
100058 started
100059 started
100060 started
100061 started
100062 started
100063 started
100065 started
100066 started
100068 started
100069 started
100071 started
100072 started
100074 started
100075 started
100077 started
100078 started
100079 started
100081 started
100085 started
100087 started
100088 started
100091 started
100094 started
100095 started
100096 started
100098 started
100099 started
100102 started
100103 started
100104 started
100106 started
100108 sta

KeyboardInterrupt: 

In [None]:
# feature = Feature()
# x, z = feature.get_OH()
# import numpy as np
# np.save('xC', x)
# np.save('zC', z)
# print("done")