### Choose the first entry of each patient. Extract heart rate data into ***HR***, create ***HR_index*** for each patient. 
***HR*** contains patient heart rate data from both vitalPeriodic.csv and nurseCharting.csv.

**1. Extract the patient IDs of the selected cardiovascular sub-categories**

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import data_toolbox

os.chdir('/Users/xiao-zy19/Desktop/Johns Hopkins/Biomedical Data Design/DataExtract/data') # change to your folder path

# import diagnosis.csv, ordered by stay id and then by diagnosis offset
df_diagnosis = pd.read_csv('diagnosis.csv')
df_diagnosis.sort_values(by=['patientunitstayid', 'diagnosisoffset'], inplace=True)


def _rows_mentioning(df, keyword):
    """
    Return the rows of df whose 'diagnosisstring' contains keyword.

    Parameters:
        df: DataFrame with a 'diagnosisstring' column
        keyword: text to look for (literal, case-sensitive substring)
    Return:
        the matching rows of df, in their original order
    """
    # na=False: a missing diagnosisstring counts as "no match". Without it the
    # mask would contain NaN and the boolean indexing below would raise.
    # regex=False: the keywords are plain text, not patterns.
    return df[df['diagnosisstring'].str.contains(keyword, regex=False, na=False)]


# select cardiovascular patients
df_cardiovascular = _rows_mentioning(df_diagnosis, 'cardiovascular')
# print(df_cardiovascular)

# get shock patient
shock_patient = _rows_mentioning(df_cardiovascular, 'shock')
# print(shock_patient)

# get ventricular patient
ventricular_patient = _rows_mentioning(df_cardiovascular, 'ventricular')
# print(ventricular_patient)

# get chest pain patient
chest_pain_patient = _rows_mentioning(df_cardiovascular, 'chest pain')

# get arrhythmias patient
arrhythmias_patient = _rows_mentioning(df_cardiovascular, 'arrhythmias')

# put id together (a row matching several sub-categories appears once per match)
df_wanted = pd.concat([shock_patient, ventricular_patient, chest_pain_patient, arrhythmias_patient])
# print(df_wanted)

# Get the patient ids from df_wanted & sort the patient id
# patient_id_all: de-duplicated stay ids, still including multiple-entry patients
patient_id_all = df_wanted['patientunitstayid'].unique()
patient_id_all.sort()
print(patient_id_all)

[ 143870  151179  151900 ... 3351297 3352230 3352231]


**2. Exclude patients whose unitvisitnumber > 1**

In [2]:
# load patient.csv ordered by stay id
df_patient = pd.read_csv('patient.csv').sort_values(by=['patientunitstayid'])

# restrict to the stays collected in patient_id_all
in_wanted_ids = df_patient['patientunitstayid'].isin(patient_id_all)
df_patient_buf = df_patient.loc[in_wanted_ids]

# keep only the rows whose unitvisitnumber is 1
is_visit_one = df_patient_buf['unitvisitnumber'].eq(1)
df_1time_patient = df_patient_buf.loc[is_visit_one]
# print(df_1time_patient)

# distinct stay ids of the remaining rows
patient_id = df_1time_patient['patientunitstayid'].unique()
print(f'Total number of patients: {len(patient_id)}')

Total number of patients: 915


**3. Extract data of patients within the id list**

In [3]:
# load vitalPeriodic.csv & nurseCharting.csv, each ordered by stay id and then by offset
df_vitalPeriodic = pd.read_csv('vitalPeriodic.csv').sort_values(
    by=['patientunitstayid', 'observationoffset']
)
df_nurseCharting = pd.read_csv('nurseCharting.csv').sort_values(
    by=['patientunitstayid', 'nursingchartoffset']
)

# keep only the rows that belong to the stays listed in patient_id
vital_is_wanted = df_vitalPeriodic['patientunitstayid'].isin(patient_id)
nurse_is_wanted = df_nurseCharting['patientunitstayid'].isin(patient_id)
df_vitalPeriodic = df_vitalPeriodic.loc[vital_is_wanted]
df_nurseCharting = df_nurseCharting.loc[nurse_is_wanted]

# save the wanted file (uncomment the code to save)
# df_vitalPeriodic.to_csv('vitalPeriodic_wanted.csv', index=False)

# report how many rows / columns are left in each table
print(f'vitalperiodic shape: {df_vitalPeriodic.shape}')
print(f'nurseCharting shape: {df_nurseCharting.shape}')

vitalperiodic shape: (747487, 19)
nurseCharting shape: (603027, 8)


**4. Extract Heart Rate data & create index**

Note: some patient IDs do not appear in df_vitalPeriodic; this may be a limitation of the demo dataset.

In [4]:
# value1 = set(df_vitalPeriodic['patientunitstayid'].unique())
# value2 = set(patient_id)
# unique_to_2 = value2.difference(value1)
# print(unique_to_2)

In [5]:
# define heartrate preprocessing function
def normal_heartrate(num, lower=0, upper=300):
    """
    Filter a single heart rate value: out-of-range values become NaN.

    No rescaling is done; an accepted value is returned unchanged.

    Parameters:
        num: the original input value
        lower: smallest accepted value (inclusive), default 0
        upper: largest accepted value (inclusive), default 300
    Return:
        num itself if it is null or lies within [lower, upper],
        otherwise np.nan
    """
    # Return null values directly
    if pd.isna(num):
        return num
    # Remove values out of range
    if num < lower or num > upper:
        return np.nan
    # Return in-range values directly
    return num

In [6]:
# keep the rows whose nursingchartcelltypevallabel is 'Heart Rate', then give the
# offset / value columns the same names as the matching vitalPeriodic columns
is_heart_rate = df_nurseCharting['nursingchartcelltypevallabel'].eq('Heart Rate')
column_renames = {
    'nursingchartoffset': 'observationoffset',
    'nursingchartvalue': 'heartrate',
}
df_nurseCharting = df_nurseCharting.loc[is_heart_rate].rename(columns=column_renames)
print(df_nurseCharting.head())

     nursingchartid  patientunitstayid  observationoffset  \
130       200873835             143870               -424   
155        91951972             143870                -67   
346       146117804             143870                -57   
344       146377143             143870                -17   
169       182492893             143870                 -2   

     nursingchartentryoffset nursingchartcelltypecat  \
130                     -424             Vital Signs   
155                      -67             Vital Signs   
346                      -57             Vital Signs   
344                      -17             Vital Signs   
169                       -2             Vital Signs   

    nursingchartcelltypevallabel nursingchartcelltypevalname heartrate  
130                   Heart Rate                  Heart Rate        53  
155                   Heart Rate                  Heart Rate        55  
346                   Heart Rate                  Heart Rate        49  
344 

In [7]:
# extract heart rate from df_vitalPeriodic & df_nurseCharting
hr_columns = ['patientunitstayid', 'observationoffset', 'heartrate']
HR_v = df_vitalPeriodic[hr_columns]
HR_n = df_nurseCharting[hr_columns]
HR = pd.concat([HR_v, HR_n]).astype(float)
HR.sort_values(by=['patientunitstayid', 'observationoffset'], inplace=True)
print(f'First 5 rows of HR: \n{HR.head()} \n')

# exclude abnormal heart rate values
HR.loc[:, 'heartrate'] = HR['heartrate'].apply(normal_heartrate)

# save HR to csv file (uncomment the code to save)
# HR.to_csv('HR.csv', index=False)

# Locate the first row of every patient by POSITION (for HR.iloc), not by label:
# after the concat the index labels of HR are not unique.
# A row is a first occurrence exactly when its stay id has not been seen before,
# which is what `duplicated()` reports; this replaces a Python loop over all rows.
stay_ids = HR['patientunitstayid']
is_first_row = ~stay_ids.duplicated().to_numpy()
first_occurrences = np.flatnonzero(is_first_row).tolist()

# stay id -> position of its first row (same content the loop used to build)
value_position_dict = dict(zip(stay_ids.to_numpy()[is_first_row].tolist(), first_occurrences))

# closing sentinel so that patient i is always HR.iloc[HR_index[i]:HR_index[i+1]]
first_occurrences.append(len(HR))
# create first occurrence index for every patient
HR_index = pd.Series(first_occurrences)
print(f'First 5 rows of HR_index: \n{HR_index.head()}')

# double check the index is correct
# print(HR.iloc[HR_index].head())
# print(HR.iloc[[156, 157, 158, 159]])
# print(HR.iloc[[1015, 1016, 1017, 1018]])

First 5 rows of HR: 
     patientunitstayid  observationoffset  heartrate
130           143870.0             -424.0       53.0
155           143870.0              -67.0       55.0
346           143870.0              -57.0       49.0
344           143870.0              -17.0       51.0
169           143870.0               -2.0       44.0 

First 5 rows of HR_index: 
0       0
1     185
2    1365
3    2201
4    3071
dtype: int64


**Example: how to use HR & HR_index**

In [8]:
# The rows of the i-th patient (i starts from 0) sit between two consecutive
# entries of HR_index: HR.iloc[HR_index[i]:HR_index[i+1]]
i = 0
row_start, row_stop = HR_index[i], HR_index[i + 1]
patient_rows = HR.iloc[row_start:row_stop]
print(f'HeartRate data for patient {i+1}: \n{patient_rows}')

HeartRate data for patient 1: 
     patientunitstayid  observationoffset  heartrate
130           143870.0             -424.0       53.0
155           143870.0              -67.0       55.0
346           143870.0              -57.0       49.0
344           143870.0              -17.0       51.0
169           143870.0               -2.0       44.0
..                 ...                ...        ...
566           143870.0              792.0       49.0
106           143870.0              793.0       49.0
219           143870.0              853.0       47.0
120           143870.0              973.0       47.0
315           143870.0             1108.0       45.0

[185 rows x 3 columns]
