### Choose the first entry of each patient. Extract heart rate data into ***HR***, create ***HR_index*** for each patient. 

**1. Extract patient ids for the selected cardiovascular sub-categories**

In [111]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import data_toolbox

# Load diagnosis.csv, ordered by stay id and then by diagnosis offset
df_diagnosis = pd.read_csv('diagnosis.csv').sort_values(
    by=['patientunitstayid', 'diagnosisoffset']
)


def _rows_mentioning(df, keyword):
    """Return the rows of df whose 'diagnosisstring' contains keyword.

    The keyword is matched as a literal, case-sensitive substring
    (regex=False). Rows with a missing diagnosisstring count as
    "no match" (na=False); with the pandas default they would put NaN into
    the mask and make the boolean indexing raise.
    """
    mask = df['diagnosisstring'].str.contains(keyword, regex=False, na=False)
    return df[mask]


# select cardiovascular patients
df_cardiovascular = _rows_mentioning(df_diagnosis, 'cardiovascular')

# sub-categories of interest, each selected from the cardiovascular rows
shock_patient = _rows_mentioning(df_cardiovascular, 'shock')
ventricular_patient = _rows_mentioning(df_cardiovascular, 'ventricular')
chest_pain_patient = _rows_mentioning(df_cardiovascular, 'chest pain')
arrhythmias_patient = _rows_mentioning(df_cardiovascular, 'arrhythmias')

# Stack the four selections; a row that matches several keywords appears
# once per matching keyword.
df_wanted = pd.concat([shock_patient, ventricular_patient, chest_pain_patient, arrhythmias_patient])

# Unique stay ids in ascending order.
# patient_id_all still includes stay ids that belong to repeated admissions.
patient_id_all = np.sort(df_wanted['patientunitstayid'].unique())
print(patient_id_all)

[ 143870  151179  151900 ... 3351297 3352230 3352231]


**2. Exclude patients whose unitvisitnumber > 1**

In [112]:
# Load patient.csv, ordered by stay id
df_patient = pd.read_csv('patient.csv').sort_values(by=['patientunitstayid'])

# Keep the stays selected in patient_id_all, then only the rows whose
# unitvisitnumber equals 1
df_patient_buf = df_patient.loc[
    lambda frame: frame['patientunitstayid'].isin(patient_id_all)
]
df_1time_patient = df_patient_buf.loc[
    lambda frame: frame['unitvisitnumber'] == 1
]

# Unique stay ids that remain after both filters
patient_id = df_1time_patient['patientunitstayid'].unique()
print(f'Total number of patients: {len(patient_id)}')

Total number of patients: 915


**3. Extract data of patients within the id list**

In [113]:
# Load vitalPeriodic.csv, order it by stay id and observation offset,
# and keep only the rows that belong to the stays listed in patient_id
df_vitalPeriodic = (
    pd.read_csv('vitalPeriodic.csv')
    .sort_values(by=['patientunitstayid', 'observationoffset'])
    .loc[lambda frame: frame['patientunitstayid'].isin(patient_id)]
)

# Optional export of the filtered table (uncomment to write the file)
# df_vitalPeriodic.to_csv('vitalPeriodic_wanted.csv', index=False)

# Preview the first rows of the filtered table
print(df_vitalPeriodic.head())

     vitalperiodicid  patientunitstayid  observationoffset  temperature  sao2  \
628         47431476             143870                  7          NaN  98.0   
574         47431336             143870                 12          NaN  99.0   
543         47431195             143870                 17          NaN  98.0   
580         47431054             143870                 22          NaN  98.0   
519         47430916             143870                 27          NaN  98.0   

     heartrate  respiration  cvp  etco2  systemicsystolic  systemicdiastolic  \
628       44.0         86.0  NaN    NaN             111.0               38.0   
574       42.0         75.0  NaN    NaN             114.0               37.0   
543       41.0         78.0  NaN    NaN             113.0               37.0   
580       41.0         73.0  NaN    NaN             113.0               37.0   
519       41.0         69.0  NaN    NaN             111.0               36.0   

     systemicmean  pasystolic  p

**4. Extract Heart Rate data & create index**

Note: some patient ids in the id list do not appear in df_vitalPeriodic; this may be a limitation of the demo dataset.

In [114]:
# value1 = set(df_vitalPeriodic['patientunitstayid'].unique())
# value2 = set(patient_id)
# unique_to_2 = value2.difference(value1)
# print(unique_to_2)

In [115]:
# define heartrate preprocessing function
def normal_heartrate(num, lower=0, upper=300):
    """
    Replace an out-of-range heart rate value with NaN.

    Despite the name, no rescaling is performed: a value is either passed
    through unchanged or replaced by NaN.

    Parameters:
        num: the original input value (a scalar; may be missing)
        lower: smallest value accepted as valid, inclusive (default 0)
        upper: largest value accepted as valid, inclusive (default 300)
    Return:
        num itself when it is missing or lies within [lower, upper],
        np.nan otherwise
    """
    # Missing values pass through untouched
    if pd.isna(num):
        return num
    # Anything outside the accepted range is treated as invalid
    if num < lower or num > upper:
        return np.nan
    # In-range values are returned as they are
    return num

In [116]:
# extract heart rate from df_vitalPeriodic
# .copy() gives HR its own data: writing into a bare column subset of another
# frame can raise SettingWithCopyWarning.
HR = df_vitalPeriodic[['patientunitstayid', 'observationoffset', 'heartrate']].copy()
print(f'First 5 rows of HR: \n{HR.head()}')

# exclude abnormal heart rate values
HR['heartrate'] = HR['heartrate'].apply(normal_heartrate)

# save HR to csv file (uncomment the code to save)
# HR.to_csv('HR.csv', index=False)

# HR.iloc[HR_index[i]:HR_index[i+1]] only returns one patient's rows when each
# patient's rows are stored next to each other, so check the ordering first.
assert HR['patientunitstayid'].is_monotonic_increasing, \
    'HR must be sorted by patientunitstayid before HR_index is built'

# Position (usable with .iloc) of the first row of every patient:
# duplicated() flags every repeat of an id, so the unflagged rows are the
# first occurrences.
stay_ids = HR['patientunitstayid']
first_positions = np.flatnonzero(~stay_ids.duplicated().to_numpy())

# patient id -> position of that patient's first row
value_position_dict = dict(
    zip(stay_ids.iloc[first_positions].tolist(), first_positions.tolist())
)

# Start position of every patient, plus len(HR) as a closing sentinel so that
# the last patient also has an end position.
first_occurrences = first_positions.tolist()
first_occurrences.append(len(HR))
# create first occurrence index for every patient
HR_index = pd.Series(first_occurrences)
print(f'First 5 rows of HR_index: \n{HR_index.head()}')

# double check the index is correct
# print(HR.iloc[HR_index].head())
# print(HR.iloc[[156, 157, 158, 159]])
# print(HR.iloc[[1015, 1016, 1017, 1018]])

First 5 rows of HR: 
     patientunitstayid  observationoffset  heartrate
628             143870                  7       44.0
574             143870                 12       42.0
543             143870                 17       41.0
580             143870                 22       41.0
519             143870                 27       41.0
First 5 rows of HR_index: 
0       0
1     158
2    1017
3    1708
4    2501
dtype: int64


**Example: how to use HR & HR_index**

In [122]:
# Rows of the i-th patient (i counts from 0) sit between two consecutive
# entries of HR_index: HR.iloc[HR_index[i]:HR_index[i+1]]
i = 0
patient_rows = HR.iloc[HR_index[i]:HR_index[i + 1]]
print(f'HeartRate data for patient {i+1}: \n{patient_rows}')

HeartRate data for patient 1: 
     patientunitstayid  observationoffset  heartrate
628             143870                  7       44.0
574             143870                 12       42.0
543             143870                 17       41.0
580             143870                 22       41.0
519             143870                 27       41.0
..                 ...                ...        ...
614             143870                772       50.0
584             143870                777       51.0
578             143870                782       48.0
572             143870                787       48.0
566             143870                792       49.0

[158 rows x 3 columns]
