### Choose the first entry of each patient. Extract body temperature data into ***Temp***, create ***Temp_index*** for each patient. 
Contains patient body temperature data from both vitalPeriodic.csv & nurseCharting.csv

**1. Extract the patient IDs of the cardiovascular sub-categories (shock, ventricular, chest pain, arrhythmias)**

In [2]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Data directory. By default this is the original author's local path; set the
# EICU_DATA_DIR environment variable to point at your own copy of the eICU
# tables instead of editing this cell.
# Previously used locations, kept for reference:
# os.chdir('C:/Users/xiao-zy19/Desktop/Johns Hopkins/Biomedical Data Design/EICU Database/eicu-collaborative-research-database-demo-2.0.1')
# os.chdir('/Users/xiao-zy19/Desktop/Johns Hopkins/Biomedical Data Design/DataExtract/data')
os.chdir(os.environ.get(
    'EICU_DATA_DIR',
    '/Users/xiao-zy19/Desktop/Johns Hopkins/Biomedical Data Design/EICU/eicu-collaborative-research-database-2.0',
))

# import diagnosis.csv, ordered by stay id and diagnosis offset
df_diagnosis = pd.read_csv('diagnosis.csv')
df_diagnosis.sort_values(by=['patientunitstayid', 'diagnosisoffset'], inplace=True)

# select cardiovascular patients
# na=False makes rows with a missing diagnosisstring count as non-matching
# instead of putting a NaN into the boolean mask.
df_cardiovascular = df_diagnosis[df_diagnosis['diagnosisstring'].str.contains('cardiovascular', na=False)]

# sub-category frames; df_cardiovascular has no missing diagnosisstring left,
# so the plain substring test is sufficient here
# get shock patient
shock_patient = df_cardiovascular[df_cardiovascular['diagnosisstring'].str.contains('shock')]

# get ventricular patient
ventricular_patient = df_cardiovascular[df_cardiovascular['diagnosisstring'].str.contains('ventricular')]

# get chest pain patient
chest_pain_patient = df_cardiovascular[df_cardiovascular['diagnosisstring'].str.contains('chest pain')]

# get arrhythmias patient
arrhythmias_patient = df_cardiovascular[df_cardiovascular['diagnosisstring'].str.contains('arrhythmias')]

# put the four sub-categories together (a diagnosis row matching several
# sub-categories appears once per match; ids are de-duplicated below)
df_wanted = pd.concat([shock_patient, ventricular_patient, chest_pain_patient, arrhythmias_patient])

# Get the unique patientunitstayid values from df_wanted, sorted ascending.
# A patient with several unit stays contributes one id per stay at this point.
patient_id_all = df_wanted['patientunitstayid'].unique()
patient_id_all.sort()
print(patient_id_all)

[ 141168  141203  141227 ... 3353216 3353235 3353251]


**2. Exclude patients whose unitvisitnumber > 1**

In [3]:
# Load the patient table, ordered by unit stay id
df_patient = pd.read_csv('patient.csv').sort_values(by=['patientunitstayid'])

# Restrict to the stays selected above, then keep first unit visits only
in_cohort = df_patient['patientunitstayid'].isin(patient_id_all)
df_patient_buf = df_patient[in_cohort]
is_first_visit = df_patient_buf['unitvisitnumber'] == 1
df_1time_patient = df_patient_buf[is_first_visit]

# Unique stay ids of the remaining patients
patient_id = df_1time_patient['patientunitstayid'].unique()
print(f'Total number of patients: {len(patient_id)}')

Total number of patients: 71353


**3. Extract data of patients within the id list**

In [4]:
# import vitalPeriodic.csv & nurseCharting.csv
# Each table is restricted to the wanted patients first and sorted afterwards,
# so only the rows that are kept get sorted (instead of the full tables).
df_vitalPeriodic = pd.read_csv('vitalPeriodic.csv')
df_vitalPeriodic = df_vitalPeriodic[df_vitalPeriodic['patientunitstayid'].isin(patient_id)]
df_vitalPeriodic = df_vitalPeriodic.sort_values(by=['patientunitstayid', 'observationoffset'])

df_nurseCharting = pd.read_csv('nurseCharting.csv')
df_nurseCharting = df_nurseCharting[df_nurseCharting['patientunitstayid'].isin(patient_id)]
df_nurseCharting = df_nurseCharting.sort_values(by=['patientunitstayid', 'nursingchartoffset'])

# save the wanted file (uncomment the code to save)
# df_vitalPeriodic.to_csv('vitalPeriodic_wanted.csv', index=False)

# print the shape of the wanted file
print(f'vitalperiodic shape: {df_vitalPeriodic.shape}')
print(f'nurseCharting shape: {df_nurseCharting.shape}')

vitalperiodic shape: (63195275, 19)
nurseCharting shape: (61929777, 8)


**4. Extract temperature data & create index**

In [5]:
# define temperature transformation function
def normal_temperature(num):
    """
    Normalize a single temperature reading to degrees Celsius.

    Parameters:
        num: the original input value; a value above 50 is treated as Fahrenheit
    Return:
        num: the value in Celsius, the input itself if it is null,
             or NaN if the (converted) value lies outside 15-45
    """
    # Return null values directly
    if pd.isna(num):
        return num
    # Values above 50 are treated as Fahrenheit and converted exactly once.
    # Re-applying the conversion to an already converted value would map
    # implausible readings (e.g. 200) back into the accepted range.
    if num > 50:
        num = (num - 32) * 5 / 9
    # Remove values out of range
    if num < 15 or num > 45:
        return np.nan
    # Return normal values directly
    return num

In [6]:
# Keep only the charting rows whose nursingchartcelltypevallabel is 'Temperature'
is_temperature_row = df_nurseCharting['nursingchartcelltypevallabel'] == 'Temperature'
df_nurseCharting = df_nurseCharting[is_temperature_row]

# Use the same column names as df_vitalPeriodic for the offset and the value
column_renames = {'nursingchartoffset': 'observationoffset', 'nursingchartvalue': 'temperature'}
df_nurseCharting = df_nurseCharting.rename(columns=column_renames)
print(df_nurseCharting.head())

           nursingchartid  patientunitstayid  observationoffset  \
151470743       222080627             141168                351   
151470759       222080625             141168                351   
151470766       222080626             141168                351   
151470826       257836794             141168                563   
151470832       257836793             141168                563   

           nursingchartentryoffset nursingchartcelltypecat  \
151470743                      351             Vital Signs   
151470759                      351             Vital Signs   
151470766                      351             Vital Signs   
151470826                      563             Vital Signs   
151470832                      563             Vital Signs   

          nursingchartcelltypevallabel nursingchartcelltypevalname  \
151470743                  Temperature             Temperature (F)   
151470759                  Temperature             Temperature (C)   
151470766     

In [7]:
# extract temperature from df_vitalPeriodic & df_nurseCharting
Temp_v = df_vitalPeriodic[['patientunitstayid', 'observationoffset', 'temperature']]
Temp_n = df_nurseCharting[['patientunitstayid', 'observationoffset', 'temperature']]

# delete the rows with string values: keep only entries that consist of digits
# with at most one decimal point (this also drops missing values)
Temp_n = Temp_n[Temp_n['temperature'].apply(lambda x: str(x).replace('.', '', 1).isdigit())]
print(f'First 5 rows of Temp_n: \n{Temp_n.head()} \n')

# stack both sources, cast every column to float and order by patient / time
Temp = pd.concat([Temp_v, Temp_n]).astype(float)
Temp.sort_values(by=['patientunitstayid', 'observationoffset'], inplace=True)
print(f'First 5 rows of Temp: \n{Temp.head()} \n')

# convert to Celsius and exclude abnormal temperature values
Temp.loc[:, 'temperature'] = Temp['temperature'].apply(normal_temperature)

# save Temp to csv file (uncomment the code to save)
# Temp.to_csv('Temp.csv', index=False)

# Positional index of the first row of every patient, computed without a
# Python-level loop: a row is a first occurrence when its id has not been
# seen earlier in the column.
stay_ids = Temp['patientunitstayid']
first_positions = np.flatnonzero(~stay_ids.duplicated().to_numpy())
first_occurrences = first_positions.tolist()
# patient id -> position of that patient's first row in Temp
value_position_dict = dict(zip(stay_ids.to_numpy()[first_positions].tolist(), first_occurrences))

# sentinel: one past the last row, so that patient i spans
# Temp_index[i]:Temp_index[i+1] for every patient including the last one
first_occurrences.append(len(Temp))
# create first occurrence index for every patient
Temp_index = pd.Series(first_occurrences)
print(f'First 5 rows of Temp_index: \n{Temp_index.head()}')

First 5 rows of Temp_n: 
           patientunitstayid  observationoffset temperature
151470743             141168                351          97
151470759             141168                351        36.1
151470826             141168                563        98.1
151470833             141168                563        36.7
151470659             141168                808        36.8 

First 5 rows of Temp: 
     patientunitstayid  observationoffset  temperature
15            141168.0              119.0          NaN
201           141168.0              124.0          NaN
213           141168.0              129.0          NaN
415           141168.0              134.0          NaN
177           141168.0              139.0          NaN 

First 5 rows of Temp_index: 
0       0
1     435
2     840
3    1216
4    1269
dtype: int64


**Example: how to use Temp & Temp_index**

In [8]:
# To get the i-th patient's data (i starts from 0), slice Temp between two
# consecutive entries of Temp_index:
# Temp.iloc[Temp_index[i]:Temp_index[i+1]]
i = 0
# The label names the quantity actually stored in Temp (temperature).
print(f'Temperature data for patient {i+1}: \n{Temp.iloc[Temp_index[i]:Temp_index[i+1]]}')

HeartRate data for patient 1: 
     patientunitstayid  observationoffset  temperature
15            141168.0              119.0          NaN
201           141168.0              124.0          NaN
213           141168.0              129.0          NaN
415           141168.0              134.0          NaN
177           141168.0              139.0          NaN
..                 ...                ...          ...
371           141168.0             2294.0          NaN
407           141168.0             2299.0          NaN
125           141168.0             2304.0          NaN
101           141168.0             2309.0          NaN
203           141168.0             2314.0          NaN

[435 rows x 3 columns]


In [10]:
# Set of IDs of the cardiovascular patients on their first visit (from patient_id)
first_time_icu_patients = set(patient_id)
print(f'First time ICU patients: {len(first_time_icu_patients)}')

# Set of IDs of the patients that have temperature data, i.e. appear in Temp
# NOTE(review): Temp also holds rows whose temperature is NaN, so a patient
# with only NaN readings is still counted here - confirm this is intended.
ids_in_temp = Temp['patientunitstayid'].unique()
patients_with_temperature = set(ids_in_temp)
print(f'Patients with temperature: {len(patients_with_temperature)}')

# Set of IDs of the patients without temperature data (absent from Temp)
patients_without_temperature = first_time_icu_patients.difference(patients_with_temperature)
print(f'Patients without temperature: {len(patients_without_temperature)}')
print(f'Patients without temperature: \n {patients_without_temperature}')

First time ICU patients: 71353
Patients with temperature: 71263
Patients without temperature: 90
Patients without temperature: 
 {2801153, 1447941, 1680905, 3137038, 3139088, 3135506, 1045022, 3043878, 1328169, 3102249, 3141677, 1042499, 3150922, 2933841, 3131989, 1848412, 1760355, 1244779, 3047020, 3154544, 1746547, 3098742, 399992, 1666682, 3154053, 986248, 1074314, 963212, 369297, 3092117, 3064492, 3034287, 1078979, 3081425, 356563, 838356, 381141, 372953, 272096, 388324, 1349865, 240876, 1836781, 3142894, 849647, 1594612, 1648374, 1814272, 1010951, 3038473, 1854743, 3034911, 975144, 3141433, 1784634, 3126092, 3117901, 3134285, 2301278, 1642345, 761196, 2883956, 438647, 1663354, 2707327, 3136390, 1815431, 1050505, 3089801, 1719695, 3148181, 935324, 3171236, 3156909, 3102646, 3336132, 3058631, 3043273, 3092939, 1847757, 1839057, 784853, 1609180, 3072991, 3155425, 816098, 3134446, 3062255, 1249263, 1074163}
