### Choose the first entry of each patient. Extract Urine data into ***Urine***, create ***Urine_index*** for each patient. 
Contains patient Urine data from both IntakeOutput.csv

**1. Extract sub-categories patient id from cardiovascular**

In [12]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# change to your folder path
os.chdir('/Users/xiao-zy19/Desktop/Johns Hopkins/Biomedical Data Design/DataExtract/data') 

# import diagnosis.csv
df_diagnosis = pd.read_csv('diagnosis.csv')
df_diagnosis.sort_values(by=['patientunitstayid', 'diagnosisoffset'], inplace=True)

# select cardiovascular patients
df_cardiovascular = df_diagnosis[df_diagnosis['diagnosisstring'].str.contains('cardiovascular')]

# get shock patient 
shock_patient = df_cardiovascular[df_cardiovascular['diagnosisstring'].str.contains('shock')]

# get ventricular patient 
ventricular_patient = df_cardiovascular[df_cardiovascular['diagnosisstring'].str.contains('ventricular')]

# get chest pain patient 
chest_pain_patient = df_cardiovascular[df_cardiovascular['diagnosisstring'].str.contains('chest pain')]

# get arrhythmias patient 
arrhythmias_patient = df_cardiovascular[df_cardiovascular['diagnosisstring'].str.contains('arrhythmias')]

# put id together
df_wanted = pd.concat([shock_patient, ventricular_patient, chest_pain_patient, arrhythmias_patient])

# Get the patient ids from df_wanted & sort the patient id
# patient_id_all multiple entry patient's stayid
patient_id_all = df_wanted['patientunitstayid'].unique()
patient_id_all.sort()
print(patient_id_all)

[ 143870  151179  151900 ... 3351297 3352230 3352231]


**2. Exclude patient whose unitvisitnumbe>1**

In [13]:
# import patient.csv
df_patient = pd.read_csv('patient.csv')
df_patient.sort_values(by=['patientunitstayid'], inplace=True)
df_patient_buf = df_patient[df_patient['patientunitstayid'].isin(patient_id_all)]
df_1time_patient = df_patient_buf[df_patient_buf['unitvisitnumber']==1]
# print(df_1time_patient)

# select the patient id from df_1time_patient
patient_id = df_1time_patient['patientunitstayid'].unique()
print(f'Total number of patients: {len(patient_id)}')

Total number of patients: 915


**3. Extract data of patients within the id list**

In [14]:
# import intakeOutput.csv
df_intakeOutput = pd.read_csv("intakeOutput.csv")
df_intakeOutput.sort_values(
    by=["patientunitstayid", "intakeoutputoffset"], inplace=True
)

# select the wanted patient
df_intakeOutput = df_intakeOutput[df_intakeOutput["patientunitstayid"].isin(patient_id)]

# print the shape of the wanted file
print(f"intaekOutput shape: {df_intakeOutput.shape}")

intaekOutput shape: (38176, 12)


**4. Extract Urine Output data & create index**

In [15]:
# choose only intakeOutput Urine cell label
df_UrineOutput = df_intakeOutput[df_intakeOutput['celllabel']=='Urine']
df_UrineOutput = df_UrineOutput.rename(columns={'cellvaluenumeric':'UrineOutput'})

print(df_UrineOutput.head())

      intakeoutputid  patientunitstayid  intakeoutputoffset  intaketotal  \
4636        15608121             143870                1183          0.0   
5263        16475633             151179                6297          0.0   
5696        17106631             151179                6522          0.0   
5248        16437793             151900                1115          0.0   
1281        11065612             151900                2435        595.0   

      outputtotal  dialysistotal  nettotal  intakeoutputentryoffset  \
4636        300.0            0.0    -300.0                     1183   
5263          0.0            0.0       0.0                     6297   
5696          0.0            0.0       0.0                     6522   
5248        850.0            0.0    -850.0                     1115   
1281          1.0            0.0     594.0                     2435   

                                               cellpath celllabel  \
4636  flowsheet|Flowsheet Cell Labels|I&O|Outpu

In [28]:
# extract Urine data from intakeOutput.csv
Urine = df_UrineOutput[['patientunitstayid', 'intakeoutputoffset', 'UrineOutput']].copy()
Urine.sort_values(by=['patientunitstayid', 'intakeoutputoffset'], inplace=True)

value_position_dict = {}
first_occurrences = []
for idx, value in enumerate(Urine['patientunitstayid']):
    # if the value is not in the dictionary, add it and create index
    if value not in value_position_dict:
        value_position_dict[value] = idx
        first_occurrences.append(idx)

first_occurrences.append(len(Urine))
# create first occurrence index for every patient
Urine_index = pd.Series(first_occurrences)
print(f'First 5 rows of Urine_index: \n{Urine_index.head()}')

First 5 rows of Urine_index: 
0     0
1     1
2     3
3     7
4    15
dtype: int64


**Example: how to use Urine & Urine_index**

In [21]:
# if we want the i th patient's data (i starts from 0)
# use Urine.iloc[Urine_index[i]:Urine_index[i+1]]
i = 1
print(f'Urine Output data for patient {i+1}: \n{Urine.iloc[Urine_index[i]:Urine_index[i+10]]}')

HeartRate data for patient 2: 
      patientunitstayid  intakeoutputoffset  UrineOutput
5263             151179                6297          0.0
5696             151179                6522          0.0
5248             151900                1115        850.0
1281             151900                2435          1.0
971              151900                2555          1.0
...                 ...                 ...          ...
5358             172678                1846        400.0
5557             172678                1945        350.0
4375             172678                2358        600.0
4101             172678                2538        400.0
267              172678                2658        250.0

[80 rows x 3 columns]
