### Choose the first entry of each patient. Extract Urine data into ***Urine***, create ***Urine_index*** for each patient. 
Contains patient Urine data from both IntakeOutput.csv

**1. Extract sub-categories patient id from cardiovascular**

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# change to your folder path
# os.chdir('C:/Users/xiao-zy19/Desktop/Johns Hopkins/Biomedical Data Design/EICU Database/eicu-collaborative-research-database-demo-2.0.1')
os.chdir('/Users/xiao-zy19/Desktop/Johns Hopkins/Biomedical Data Design/EICU/eicu-collaborative-research-database-2.0')

# import diagnosis.csv
df_diagnosis = pd.read_csv('diagnosis.csv')
df_diagnosis.sort_values(by=['patientunitstayid', 'diagnosisoffset'], inplace=True)

# select cardiovascular patients
df_cardiovascular = df_diagnosis[df_diagnosis['diagnosisstring'].str.contains('cardiovascular')]

# get shock patient 
shock_patient = df_cardiovascular[df_cardiovascular['diagnosisstring'].str.contains('shock')]

# get ventricular patient 
ventricular_patient = df_cardiovascular[df_cardiovascular['diagnosisstring'].str.contains('ventricular')]

# get chest pain patient 
chest_pain_patient = df_cardiovascular[df_cardiovascular['diagnosisstring'].str.contains('chest pain')]

# get arrhythmias patient 
arrhythmias_patient = df_cardiovascular[df_cardiovascular['diagnosisstring'].str.contains('arrhythmias')]

# put id together
df_wanted = pd.concat([shock_patient, ventricular_patient, chest_pain_patient, arrhythmias_patient])

# Get the patient ids from df_wanted & sort the patient id
# patient_id_all multiple entry patient's stayid
patient_id_all = df_wanted['patientunitstayid'].unique()
patient_id_all.sort()
print(patient_id_all)

[ 141168  141203  141227 ... 3353216 3353235 3353251]


**2. Exclude patient whose unitvisitnumbe>1**

In [2]:
# import patient.csv
df_patient = pd.read_csv('patient.csv')
df_patient.sort_values(by=['patientunitstayid'], inplace=True)
df_patient_buf = df_patient[df_patient['patientunitstayid'].isin(patient_id_all)]
df_1time_patient = df_patient_buf[df_patient_buf['unitvisitnumber']==1]
# print(df_1time_patient)

# select the patient id from df_1time_patient
patient_id = df_1time_patient['patientunitstayid'].unique()
print(f'Total number of patients: {len(patient_id)}')

Total number of patients: 71353


**3. Extract data of patients within the id list**

In [3]:
# import intakeOutput.csv
df_intakeOutput = pd.read_csv("intakeOutput.csv")
df_intakeOutput.sort_values(
    by=["patientunitstayid", "intakeoutputoffset"], inplace=True
)

# select the wanted patient
df_intakeOutput = df_intakeOutput[df_intakeOutput["patientunitstayid"].isin(patient_id)]

# print the shape of the wanted file
print(f"intaekOutput shape: {df_intakeOutput.shape}")

intaekOutput shape: (5352427, 12)


**4. Extract Urine Output data & create index**

In [4]:
# choose only intakeOutput Urine cell label
df_UrineOutput = df_intakeOutput[df_intakeOutput['celllabel']=='Urine']
df_UrineOutput = df_UrineOutput.rename(columns={'cellvaluenumeric':'UrineOutput'})

print(df_UrineOutput.head())

        intakeoutputid  patientunitstayid  intakeoutputoffset  intaketotal  \
171912        10876745             141227               -1893          0.0   
512473        13982221             141227               -1773          0.0   
181072        10961092             141227               -1663          0.0   
544874        14279776             141227               -1351          0.0   
49487          9759089             141229                3430          0.0   

        outputtotal  dialysistotal  nettotal  intakeoutputentryoffset  \
171912        200.0            0.0    -200.0                    -1893   
512473        200.0            0.0    -200.0                    -1773   
181072        200.0            0.0    -200.0                    -1663   
544874        200.0            0.0    -200.0                    -1351   
49487         400.0            0.0    -400.0                     3430   

                                                 cellpath celllabel  \
171912  flowsheet|Flo

In [5]:
# extract Urine data from intakeOutput.csv
Urine = df_UrineOutput[['patientunitstayid', 'intakeoutputoffset', 'UrineOutput']].copy()
Urine.sort_values(by=['patientunitstayid', 'intakeoutputoffset'], inplace=True)

value_position_dict = {}
first_occurrences = []
for idx, value in enumerate(Urine['patientunitstayid']):
    # if the value is not in the dictionary, add it and create index
    if value not in value_position_dict:
        value_position_dict[value] = idx
        first_occurrences.append(idx)

first_occurrences.append(len(Urine))
# create first occurrence index for every patient
Urine_index = pd.Series(first_occurrences)
print(f'First 5 rows of Urine_index: \n{Urine_index.head()}')

First 5 rows of Urine_index: 
0     0
1     4
2     5
3    14
4    15
dtype: int64


**Example: how to use Urine & Urine_index**

In [6]:
# if we want the i th patient's data (i starts from 0)
# use Urine.iloc[Urine_index[i]:Urine_index[i+1]]
i = 1
print(f'Urine Output data for patient {i+1}: \n{Urine.iloc[Urine_index[i]:Urine_index[i+10]]}')

Urine Output data for patient 2: 
        patientunitstayid  intakeoutputoffset  UrineOutput
49487              141229                3430        400.0
631164             141266                1697        450.0
439820             141266                5839        125.0
364087             141266                6664        150.0
676971             141266                7559        100.0
...                   ...                 ...          ...
146978             141462                9512        210.0
838909             141462                9607        200.0
353334             141462               10622        200.0
841468             141462               10712        375.0
217107             141462               10892        300.0

[117 rows x 3 columns]


In [7]:
# 第一次进入ICU的心血管疾病患者ID集合
first_time_icu_patients = set(patient_id)
print(f'First time ICU patients: {len(first_time_icu_patients)}')

# 有Urine数据的心血管疾病患者ID集合
Urine_patients = set(Urine['patientunitstayid'].unique())
print(f'Patients with Urine data: {len(Urine_patients)}')


First time ICU patients: 71353
Patients with Urine data: 55414
