### Choose the first entry of each patient. Extract blood pressure data into ***Pressure***, create ***Pressure_index*** for each patient. 
Contains patient Blood Pressure data from both vitalPeriodic.csv & nurseCharting.csv

Including: Noninvasivesystolic, Non-Invasive BP Systolic, Invasive BP Systolic, Systemicsystolic

**1. Extract sub-categories patient id from cardiovascular**

In [17]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import data_toolbox

os.chdir('/Users/xiao-zy19/Desktop/Johns Hopkins/Biomedical Data Design/DataExtract/data') # change to your folder path

# import diagnosis.csv
df_diagnosis = pd.read_csv('diagnosis.csv')
df_diagnosis.sort_values(by=['patientunitstayid', 'diagnosisoffset'], inplace=True)

# select cardiovascular patients
df_cardiovascular = df_diagnosis[df_diagnosis['diagnosisstring'].str.contains('cardiovascular')]

# get shock patient 
shock_patient = df_cardiovascular[df_cardiovascular['diagnosisstring'].str.contains('shock')]

# get ventricular patient 
ventricular_patient = df_cardiovascular[df_cardiovascular['diagnosisstring'].str.contains('ventricular')]

# get chest pain patient 
chest_pain_patient = df_cardiovascular[df_cardiovascular['diagnosisstring'].str.contains('chest pain')]

# get arrhythmias patient 
arrhythmias_patient = df_cardiovascular[df_cardiovascular['diagnosisstring'].str.contains('arrhythmias')]

# put id together
df_wanted = pd.concat([shock_patient, ventricular_patient, chest_pain_patient, arrhythmias_patient])
# print(df_wanted)

# Get the patient ids from df_wanted & sort the patient id
# patient_id_all multiple entry patient's stayid
patient_id_all = df_wanted['patientunitstayid'].unique()
patient_id_all.sort()
print(patient_id_all)

[ 143870  151179  151900 ... 3351297 3352230 3352231]


**2. Exclude patient whose unitvisitnumbe>1**

In [18]:
# import patient.csv
df_patient = pd.read_csv('patient.csv')
df_patient.sort_values(by=['patientunitstayid'], inplace=True)
df_patient_buf = df_patient[df_patient['patientunitstayid'].isin(patient_id_all)]
df_1time_patient = df_patient_buf[df_patient_buf['unitvisitnumber']==1]
# print(df_1time_patient)

# select the patient id from df_1time_patient
patient_id = df_1time_patient['patientunitstayid'].unique()
print(f'Total number of patients: {len(patient_id)}')

Total number of patients: 915


**3. Extract data of patients within the id list**

In [19]:
# import vitalPeriodic.csv & nurseCharting.csv
df_vitalPeriodic = pd.read_csv('vitalPeriodic.csv')
df_vitalPeriodic.sort_values(by=['patientunitstayid', 'observationoffset'], inplace=True)
df_nurseCharting = pd.read_csv('nurseCharting.csv')
df_nurseCharting.sort_values(by=['patientunitstayid', 'nursingchartoffset'], inplace=True)
df_vitalAPeriodic = pd.read_csv('vitalAperiodic.csv')
df_vitalAPeriodic.sort_values(by=['patientunitstayid', 'observationoffset'], inplace=True)


# select the wanted patient
df_vitalPeriodic = df_vitalPeriodic[df_vitalPeriodic['patientunitstayid'].isin(patient_id)]
df_nurseCharting = df_nurseCharting[df_nurseCharting['patientunitstayid'].isin(patient_id)]
df_vitalAPeriodic = df_vitalAPeriodic[df_vitalAPeriodic['patientunitstayid'].isin(patient_id)]

# save the wanted file (uncomment the code to save)
# df_vitalPeriodic.to_csv('vitalPeriodic_wanted.csv', index=False)
# df_nurseCharting.to_csv('nurseCharting_wanted.csv', index=False)
# df_vitalAPeriodic.to_csv('vitalAPeriodic_wanted.csv', index=False)

# print the shape of the wanted file
print(f'vitalperiodic shape: {df_vitalPeriodic.shape}')
print(f'nurseCharting shape: {df_nurseCharting.shape}')
print(f'vitalAPeriodic shape: {df_vitalAPeriodic.shape}')

vitalperiodic shape: (747487, 19)
nurseCharting shape: (603027, 8)
vitalAPeriodic shape: (129156, 13)


**4. Extract Blood Pressure data & create index**


In [20]:
# define heartrate preprocessing function
def normal_heartrate(num):
    """
    Function to normalize heart rate values.

    Parameters:
        num: the originial input value
    Return:
        num: the normalized output value
    """
    # Return null values direcly
    if pd.isna(num):
        return num
    # Remove values out of range
    elif num > 300 or num < 0:
        return np.nan
    # Return normal values directly
    else:
        return num
    
def normal_temperature(num):
    """
    Function to normalize temperature values.

    Parameters:
        num: the originial input value
    Return:
        num: the normalized output value
    """
    # Return null values direcly
    if pd.isna(num):
        return num
    # Convert Fahrenheit to Celsius
    # And apply the function again
    elif num > 50:
        return normal_temperature((num - 32) * 5 / 9)
    # Remove values out of range
    elif num < 15 or num > 45:
        return np.nan
    # Return normal values directly
    else:
        return num
    
def normal_pa(systolic, diastolic, mean_p):
    """
    Function to normalize pulmonary artery blood pressure values.

    Parameters:
        num: the originial input value
    Return:
        num: the normalized output value
    """
    # Return null values direcly
    if systolic == np.nan or diastolic == np.nan or mean_p == np.nan:
        return np.nan, np.nan, np.nan
    # Remove values out of range
    elif systolic < 0 or systolic > 300:
        return np.nan, np.nan, np.nan
    elif diastolic < 0 or diastolic > 200:
        return np.nan, np.nan, np.nan
    elif mean_p < 0 or mean_p > 190:
        return np.nan, np.nan, np.nan
    elif diastolic >= mean_p:
        return np.nan, np.nan, np.nan
    elif systolic < mean_p:
        return np.nan, np.nan, np.nan
    elif systolic - diastolic <= 4:
        return np.nan, np.nan, np.nan
    # Return normal values directly
    else:
        return systolic, diastolic, mean_p

def normal_systemic(systolic, diastolic, mean_p):
    """
    Function to normalize systemic blood pressure values.

    Parameters:
        num: the originial input value
    Return:
        num: the normalized output value
    """
    # Return null values direcly
    if systolic == np.nan or diastolic == np.nan or mean_p == np.nan:
        return np.nan, np.nan, np.nan
    # Remove values out of range
    elif systolic < 0 or systolic > 300:
        return np.nan, np.nan, np.nan
    elif diastolic < 0 or diastolic > 200:
        return np.nan, np.nan, np.nan
    elif mean_p < 0 or mean_p > 190:
        return np.nan, np.nan, np.nan
    elif diastolic >= mean_p:
        return np.nan, np.nan, np.nan
    elif systolic < mean_p:
        return np.nan, np.nan, np.nan
    elif systolic - diastolic <= 4:
        return np.nan, np.nan, np.nan
    # Return normal values directly
    else:
        return systolic, diastolic, mean_p

In [21]:
# nursingchartcelltypevallabel Temperature
df_nurseCharting = df_nurseCharting[df_nurseCharting['nursingchartcelltypevalname']=='Non-Invasive BP Systolic']
df_nurseCharting = df_nurseCharting.rename(columns={'nursingchartoffset': 'observationoffset', 'nursingchartvalue':'Pressure'})
print(df_nurseCharting.head())

     nursingchartid  patientunitstayid  observationoffset  \
238       163351477             143870               -424   
280       304056246             143870                -67   
13        144801792             143870                -57   
266       253833644             143870                -32   
182       252655266             143870                -17   

     nursingchartentryoffset nursingchartcelltypecat  \
238                     -424             Vital Signs   
280                      -67             Vital Signs   
13                       -57             Vital Signs   
266                      -32             Vital Signs   
182                      -17             Vital Signs   

    nursingchartcelltypevallabel nursingchartcelltypevalname Pressure  
238              Non-Invasive BP    Non-Invasive BP Systolic      191  
280              Non-Invasive BP    Non-Invasive BP Systolic       96  
13               Non-Invasive BP    Non-Invasive BP Systolic       99  
266     

In [22]:
# extract temperature from df_vitalPeriodic & df_nurseCharting
Temp_v = df_vitalPeriodic[['patientunitstayid', 'observationoffset', 'temperature']]
Temp_n = df_nurseCharting[['patientunitstayid', 'observationoffset', 'temperature']]

# delete the rows with string values
Temp_n = Temp_n[Temp_n['temperature'].apply(lambda x: str(x).replace('.', '', 1).isdigit())]
print(f'First 5 rows of Temp_n: \n{Temp_n.head()} \n')

Temp = pd.concat([Temp_v, Temp_n]).astype(float)
Temp.sort_values(by=['patientunitstayid', 'observationoffset'], inplace=True)
print(f'First 5 rows of Temp: \n{Temp.head()} \n')

# exclude abnormal heart rate values
Temp.loc[:, 'temperature'] = Temp['temperature'].apply(normal_temperature)

# save Temp to csv file (uncomment the code to save)
# Temp.to_csv('Temp.csv', index=False)

value_position_dict = {}
first_occurrences = []
for idx, value in enumerate(Temp['patientunitstayid']):
    # if the value is not in the dictionary, add it and create index
    if value not in value_position_dict:
        value_position_dict[value] = idx
        first_occurrences.append(idx)

first_occurrences.append(len(Temp))
# create first occurrence index for every patient
Temp_index = pd.Series(first_occurrences)
print(f'First 5 rows of Temp_index: \n{Temp_index.head()}')

KeyError: "['temperature'] not in index"

**Example: how to use Temp & Temp_index**

In [None]:
# if we want the i th patient's data (i starts from 0)
# use Temp.iloc[Temp_index[i]:Temp_index[i+1]]
i = 0
print(f'HeartRate data for patient {i+1}: \n{Temp.iloc[Temp_index[i]:Temp_index[i+1]]}')

HeartRate data for patient 1: 
     patientunitstayid  observationoffset  temperature
94            143870.0             -424.0    36.111111
119           143870.0             -424.0    36.100000
316           143870.0              -72.0    36.600000
352           143870.0              -72.0    36.611111
273           143870.0              -32.0    36.100000
..                 ...                ...          ...
578           143870.0              782.0          NaN
572           143870.0              787.0          NaN
566           143870.0              792.0          NaN
25            143870.0              973.0    36.111111
41            143870.0              973.0    36.100000

[176 rows x 3 columns]
