### Choose the first entry of each patient. Extract body temperature data into ***Temp***, create ***Temp_index*** for each patient. 
Contains patient body temperature data from both vitalPeriodic.csv & nurseCharting.csv

**1. Extract the patient IDs of the cardiovascular sub-categories**

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import data_toolbox

# Load diagnosis.csv, ordered by unit stay and then by diagnosis offset.
df_diagnosis = pd.read_csv('diagnosis.csv').sort_values(
    by=['patientunitstayid', 'diagnosisoffset']
)


def _rows_mentioning(frame, keyword):
    """Return the rows of `frame` whose 'diagnosisstring' contains `keyword`.

    Parameters:
        frame: DataFrame with a 'diagnosisstring' column
        keyword: literal, case-sensitive substring to look for
    Return:
        the matching rows of `frame` (original index preserved)

    Rows with a missing diagnosis string are treated as non-matching
    (`na=False`); otherwise a single NaN would turn the mask into a
    non-boolean array and the row selection would raise.
    """
    mask = frame['diagnosisstring'].str.contains(keyword, regex=False, na=False)
    return frame[mask]


# select cardiovascular patients
df_cardiovascular = _rows_mentioning(df_diagnosis, 'cardiovascular')

# sub-categories of interest within the cardiovascular diagnoses
shock_patient = _rows_mentioning(df_cardiovascular, 'shock')
ventricular_patient = _rows_mentioning(df_cardiovascular, 'ventricular')
chest_pain_patient = _rows_mentioning(df_cardiovascular, 'chest pain')
arrhythmias_patient = _rows_mentioning(df_cardiovascular, 'arrhythmias')

# Stack the four selections. A diagnosis row that matches several keywords
# appears more than once here; the unique() below removes repeated ids.
df_wanted = pd.concat([shock_patient, ventricular_patient, chest_pain_patient, arrhythmias_patient])

# Sorted, de-duplicated unit stay ids of every selected diagnosis row.
patient_id_all = np.sort(df_wanted['patientunitstayid'].unique())
print(patient_id_all)

[ 143870  151179  151900 ... 3351297 3352230 3352231]


**2. Exclude patients whose unitvisitnumber > 1**

In [2]:
# Load patient.csv, ordered by unit stay id.
df_patient = pd.read_csv('patient.csv').sort_values(by=['patientunitstayid'])

# Narrow to the stays selected in the previous step ...
in_selected_stays = df_patient['patientunitstayid'].isin(patient_id_all)
df_patient_buf = df_patient.loc[in_selected_stays]

# ... and keep only the rows whose unitvisitnumber equals 1.
is_first_visit = df_patient_buf['unitvisitnumber'].eq(1)
df_1time_patient = df_patient_buf.loc[is_first_visit]

# Distinct stay ids that survive both filters.
patient_id = df_1time_patient['patientunitstayid'].unique()
print(f'Total number of patients: {len(patient_id)}')

Total number of patients: 915


**3. Extract data of patients within the id list**

In [3]:
# Load vitalPeriodic.csv, order it by stay and observation time, and keep
# only the rows that belong to the selected patients.
df_vitalPeriodic = (
    pd.read_csv('vitalPeriodic.csv')
    .sort_values(by=['patientunitstayid', 'observationoffset'])
    .loc[lambda frame: frame['patientunitstayid'].isin(patient_id)]
)

# Same treatment for nurseCharting.csv (ordered by its own offset column).
df_nurseCharting = (
    pd.read_csv('nurseCharting.csv')
    .sort_values(by=['patientunitstayid', 'nursingchartoffset'])
    .loc[lambda frame: frame['patientunitstayid'].isin(patient_id)]
)

# save the wanted file (uncomment the code to save)
# df_vitalPeriodic.to_csv('vitalPeriodic_wanted.csv', index=False)

# Report how many rows/columns remain in each filtered table.
print(f'vitalperiodic shape: {df_vitalPeriodic.shape}')
print(f'nurseCharting shape: {df_nurseCharting.shape}')

vitalperiodic shape: (747487, 19)
nurseCharting shape: (603027, 8)


**4. Extract body temperature data & create index**

Note: some patient ids do not appear in df_vitalPeriodic at all; this may be a limitation of the demo dataset.

In [4]:
# value1 = set(df_vitalPeriodic['patientunitstayid'].unique())
# value2 = set(patient_id)
# unique_to_2 = value2.difference(value1)
# print(unique_to_2)

In [5]:
# define heartrate preprocessing function
def normal_heartrate(num):
    """
    Clean a single heart rate reading.

    Parameters:
        num: the original input value
    Return:
        num itself when it is missing or lies within [0, 300];
        np.nan when it falls outside that range
    """
    # Missing readings are handed back untouched
    if pd.isna(num):
        return num
    # Readings inside the accepted range are kept as they are
    if 0 <= num <= 300:
        return num
    # Anything else is out of range and gets blanked
    return np.nan
    
def normal_temperature(num):
    """
    Function to normalize temperature values to degrees Celsius.

    Parameters:
        num: the original input value
    Return:
        num: the normalized output value
            - missing input is returned unchanged
            - input above 50 is treated as Fahrenheit and converted to
              Celsius exactly once
            - a value that ends up below 15 or above 45 becomes np.nan
            - any other value is returned as is
    """
    # Return null values directly
    if pd.isna(num):
        return num
    # Treat readings above 50 as Fahrenheit and convert to Celsius.
    # The conversion is applied a single time: converting repeatedly would
    # shrink implausible readings (e.g. 200 -> 93.3 -> 34.1) until they
    # land inside the accepted range and look valid.
    if num > 50:
        num = (num - 32) * 5 / 9
    # Remove values out of range
    if num < 15 or num > 45:
        return np.nan
    # Return normal values directly
    return num

In [6]:
# Keep only the charting rows whose nursingchartcelltypevallabel is
# 'Temperature', then rename the offset/value columns so they line up with
# the column names used by df_vitalPeriodic.
is_temperature_row = df_nurseCharting['nursingchartcelltypevallabel'].eq('Temperature')
df_nurseCharting = df_nurseCharting.loc[is_temperature_row].rename(
    columns={
        'nursingchartoffset': 'observationoffset',
        'nursingchartvalue': 'temperature',
    }
)
print(df_nurseCharting.head())

     nursingchartid  patientunitstayid  observationoffset  \
58         95478972             143870               -424   
94         95478973             143870               -424   
119        95478971             143870               -424   
316       240209719             143870                -72   
343       240209720             143870                -72   

     nursingchartentryoffset nursingchartcelltypecat  \
58                      -424             Vital Signs   
94                      -424             Vital Signs   
119                     -424             Vital Signs   
316                      -72             Vital Signs   
343                      -72             Vital Signs   

    nursingchartcelltypevallabel nursingchartcelltypevalname      temperature  
58                   Temperature        Temperature Location  TEMPORAL ARTERY  
94                   Temperature             Temperature (F)               97  
119                  Temperature             Temperature

In [12]:
# Take the (stay id, offset, temperature) columns from both sources.
temp_columns = ['patientunitstayid', 'observationoffset', 'temperature']
Temp_v = df_vitalPeriodic[temp_columns]
Temp_n = df_nurseCharting[temp_columns]


def _is_plain_number(value):
    """True when str(value), with at most one '.' removed, is all digits."""
    return str(value).replace('.', '', 1).isdigit()


# The charted 'temperature' column also holds text entries; keep only the
# rows whose value looks like a plain non-negative number.
Temp_n = Temp_n[Temp_n['temperature'].apply(_is_plain_number)]
print(f'First 5 rows of Temp_n: \n{Temp_n.head()} \n')

# Stack both sources, cast every column to float and order the rows by
# stay id and then by observation offset.
Temp = (
    pd.concat([Temp_v, Temp_n])
    .astype(float)
    .sort_values(by=['patientunitstayid', 'observationoffset'])
)
print(f'First 5 rows of Temp: \n{Temp.head()} \n')

# Normalize the temperature readings (unit conversion + range filter).
Temp.loc[:, 'temperature'] = Temp['temperature'].apply(normal_temperature)

# save Temp to csv file (uncomment the code to save)
# Temp.to_csv('Temp.csv', index=False)

# Row position at which each stay id shows up for the first time in Temp.
stay_ids = Temp['patientunitstayid']
first_occurrences = np.flatnonzero(~stay_ids.duplicated().to_numpy()).tolist()
# stay id -> position of its first row
value_position_dict = dict(
    zip(stay_ids.to_numpy()[first_occurrences].tolist(), first_occurrences)
)

# Closing sentinel so that consecutive entries delimit every patient's rows.
first_occurrences.append(len(Temp))
# create first occurrence index for every patient
Temp_index = pd.Series(first_occurrences)
print(f'First 5 rows of Temp_index: \n{Temp_index.head()}')

First 5 rows of Temp_n: 
     patientunitstayid  observationoffset temperature
94              143870               -424          97
119             143870               -424        36.1
316             143870                -72        36.6
352             143870                -72        97.9
273             143870                -32        36.1 

First 5 rows of Temp: 
     patientunitstayid  observationoffset  temperature
94            143870.0             -424.0         97.0
119           143870.0             -424.0         36.1
316           143870.0              -72.0         36.6
352           143870.0              -72.0         97.9
273           143870.0              -32.0         36.1 

First 5 rows of Temp_index: 
0       0
1     176
2    1167
3    1902
4    2739
dtype: int64


**Example: how to use Temp & Temp_index**

In [None]:
# To get the i-th patient's rows (i starts from 0), slice Temp between two
# consecutive entries of Temp_index:
#     Temp.iloc[Temp_index[i]:Temp_index[i+1]]
i = 0
# The rows printed are temperature readings, so the label says so.
print(f'Temperature data for patient {i+1}: \n{Temp.iloc[Temp_index[i]:Temp_index[i+1]]}')