# stretch_eicu (sepsis-pics)


## common


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from common_eicu import *


## apache patient result


In [2]:
df_apache = pd.read_csv(
    eicu_path('apachePatientResult.csv.gz'),
)
df_apache.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297064 entries, 0 to 297063
Data columns (total 23 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   apachepatientresultsid         297064 non-null  int64  
 1   patientunitstayid              297064 non-null  int64  
 2   physicianspeciality            297064 non-null  object 
 3   physicianinterventioncategory  297064 non-null  object 
 4   acutephysiologyscore           297064 non-null  int64  
 5   apachescore                    297064 non-null  int64  
 6   apacheversion                  297064 non-null  object 
 7   predictedicumortality          297064 non-null  float64
 8   actualicumortality             297064 non-null  object 
 9   predictediculos                297064 non-null  float64
 10  actualiculos                   297064 non-null  float64
 11  predictedhospitalmortality     297064 non-null  float64
 12  actualhospitalmortality       

In [7]:
df_apache['apacheversion'].unique()


array(['IV', 'IVa'], dtype=object)

In [8]:
df_apache[['patientunitstayid', 'apacheversion']] \
    .groupby('patientunitstayid') \
    .count()['apacheversion'] \
    .unique()


array([2], dtype=int64)

In [9]:
df_apache.head(8)


Unnamed: 0,apachepatientresultsid,patientunitstayid,physicianspeciality,physicianinterventioncategory,acutephysiologyscore,apachescore,apacheversion,predictedicumortality,actualicumortality,predictediculos,...,predictedhospitallos,actualhospitallos,preopmi,preopcardiaccath,ptcawithin24h,unabridgedunitlos,unabridgedhosplos,actualventdays,predventdays,unabridgedactualventdays
0,26570,141168,critical care medicine (CCM),Unknown,49,65,IV,0.026988,EXPIRED,3.038388,...,7.546453,2.4972,0,0,0,2.4972,2.4972,,,
1,26571,141168,critical care medicine (CCM),Unknown,49,65,IVa,0.028889,EXPIRED,3.091127,...,6.62872,2.4972,0,0,0,2.4972,2.4972,,,
2,53135,141194,critical care medicine (CCM),Unknown,57,70,IV,0.037888,ALIVE,4.620982,...,13.338449,9.2167,0,0,0,3.3423,9.2167,,,
3,53136,141194,critical care medicine (CCM),Unknown,57,70,IVa,0.046448,ALIVE,4.167129,...,12.978228,9.2167,0,0,0,3.3423,9.2167,,,
4,8,141203,hospitalist,I,73,90,IVa,0.291609,ALIVE,8.670299,...,16.319389,3.7493,0,0,0,1.2979,3.7493,2.0,5.738093,2.0
5,7,141203,hospitalist,I,73,90,IV,0.319783,ALIVE,7.487287,...,14.509362,3.7493,0,0,0,1.2979,3.7493,2.0,5.05381,2.0
6,53140,141208,hospitalist,I,17,17,IVa,0.00133,ALIVE,0.444563,...,0.521041,0.4215,0,0,0,0.5,0.4215,,,
7,53139,141208,hospitalist,I,17,17,IV,0.001035,ALIVE,0.188817,...,0.57973,0.4215,0,0,0,0.5,0.4215,,,


## patient data


In [2]:
df_patient = pd.read_csv(
    eicu_path('patient.csv.gz'),
)
df_patient.columns


Index(['patientunitstayid', 'patienthealthsystemstayid', 'gender', 'age',
       'ethnicity', 'hospitalid', 'wardid', 'apacheadmissiondx',
       'admissionheight', 'hospitaladmittime24', 'hospitaladmitoffset',
       'hospitaladmitsource', 'hospitaldischargeyear',
       'hospitaldischargetime24', 'hospitaldischargeoffset',
       'hospitaldischargelocation', 'hospitaldischargestatus', 'unittype',
       'unitadmittime24', 'unitadmitsource', 'unitvisitnumber', 'unitstaytype',
       'admissionweight', 'dischargeweight', 'unitdischargetime24',
       'unitdischargeoffset', 'unitdischargelocation', 'unitdischargestatus',
       'uniquepid'],
      dtype='object')

In [6]:
df_patient['admissionheight'].describe()


count    196644.000000
mean        169.247166
std          13.690182
min           0.000000
25%         162.500000
50%         170.000000
75%         177.800000
max         612.600000
Name: admissionheight, dtype: float64

## diagnosis data


In [15]:
df_sepsis = pd.read_csv(
    './data/sepsis_eicu.csv.gz',
)
df_sepsis.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115375 entries, 0 to 115374
Data columns (total 2 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   patientunitstayid  115375 non-null  int64 
 1   diagnosisstring    115375 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.8+ MB


In [13]:
len(df_sepsis[KEY_IDENTITY].unique())


23479

## treatment data


In [18]:
df_treatment = pd.read_csv(
    TREATMENT_PATH,
    nrows=5000,
)
df_treatment.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   treatmentid          5000 non-null   int64 
 1   patientunitstayid    5000 non-null   int64 
 2   treatmentoffset      5000 non-null   int64 
 3   treatmentstring      5000 non-null   object
 4   activeupondischarge  5000 non-null   bool  
dtypes: bool(1), int64(3), object(1)
memory usage: 161.3+ KB


In [6]:
df_treatment[
    (df_treatment['patientunitstayid'] == 242290)
    & (
        df_treatment['treatmentstring'].map(
            lambda s: 'vasopressor' in s
        )
    )
]


Unnamed: 0,treatmentid,patientunitstayid,treatmentoffset,treatmentstring,activeupondischarge
245,8645242,242290,162,cardiovascular|shock|vasopressors|norepinephri...,False
253,10019133,242290,165,cardiovascular|shock|vasopressors|norepinephri...,True


In [23]:
with open('./data/treatment_strings.txt', 'w') as output_file:
    for string in df_treatment[KEY_TREATMENT_STRING].unique():
        output_file.write(string)
        output_file.write('\n')


## exam data


In [9]:
df_exam = pd.read_csv(
    # './data/exam_eicu_processed.csv.gz',
    EXAM_PATH,
    nrows=100_000,
)
df_exam.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   physicalexamid      100000 non-null  int64 
 1   patientunitstayid   100000 non-null  int64 
 2   physicalexamoffset  100000 non-null  int64 
 3   physicalexampath    100000 non-null  object
 4   physicalexamvalue   100000 non-null  object
 5   physicalexamtext    100000 non-null  object
dtypes: int64(3), object(3)
memory usage: 4.6+ MB


In [12]:
df_exam[
    df_exam[KEY_EXAM_NAME] == (
        'notes/Progress Notes/Physical Exam/Physical Exam'
        '/Neurologic/GCS/10'
    )
]


Unnamed: 0,physicalexamid,patientunitstayid,physicalexamoffset,physicalexampath,physicalexamvalue,physicalexamtext
10344,5351815,228672,21,notes/Progress Notes/Physical Exam/Physical Ex...,10,10
40987,5636870,181047,31,notes/Progress Notes/Physical Exam/Physical Ex...,10,10
69285,5908334,192574,64,notes/Progress Notes/Physical Exam/Physical Ex...,10,10
90873,6126450,152074,59,notes/Progress Notes/Physical Exam/Physical Ex...,10,10


In [34]:
with open('./data/exam_names.txt', 'w') as output_file:
    # for name in df_exam[KEY_EXAM_NAME].unique():
    #     output_file.write(name)
    #     output_file.write('\n')
    item_columns = ['physicalexampath', 'physicalexamvalue']
    raw_items = df_exam[item_columns].values
    items = np.array([
        f'{path} :: {value}'
        for path, value in raw_items
    ])
    unique_items = np.unique(items)
    for item in unique_items:
        output_file.write(item)
        output_file.write('\n')


## lab data


In [15]:
df_lab = pd.read_csv(
    # './data/lab_eicu_processed.csv.gz',
    LAB_PATH,
    nrows=50000,
)
df_lab.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   labid                    50000 non-null  int64  
 1   patientunitstayid        50000 non-null  int64  
 2   labresultoffset          50000 non-null  int64  
 3   labtypeid                50000 non-null  int64  
 4   labname                  50000 non-null  object 
 5   labresult                49781 non-null  float64
 6   labresulttext            50000 non-null  object 
 7   labmeasurenamesystem     47371 non-null  object 
 8   labmeasurenameinterface  49255 non-null  object 
 9   labresultrevisedoffset   50000 non-null  int64  
dtypes: float64(1), int64(5), object(4)
memory usage: 3.8+ MB


In [15]:
# df_lab[df_lab['patientunitstayid'] == 242505][['patientunitstayid','labname']] \
#     .groupby('labname').count()
# df_lab[
#     (df_lab['patientunitstayid'] == 242505)
#     & (df_lab['labname'].isin(['albumin', '-lymphs']))
# ].sort_values(by='labresultoffset')
# df_lab[
#     df_lab['labname'].map(
#         lambda s: 'wbc' in s.lower()
#     )
# ]['labname'].unique()
df_lab[df_lab['labname'] == 'WBC x 1000'].head()


Unnamed: 0,labid,patientunitstayid,labresultoffset,labtypeid,labname,labresult,labresulttext,labmeasurenamesystem,labmeasurenameinterface,labresultrevisedoffset
52,54358175,141168,2026,3,WBC x 1000,19.8,19.8,K/mcL,K/mcL,2148
118,56155662,141168,516,3,WBC x 1000,9.8,9.8,K/mcL,K/mcL,524
135,53275575,141168,1133,3,WBC x 1000,14.7,14.7,K/mcL,K/mcL,1196
148,53200536,141178,-280,3,WBC x 1000,7.6,7.6,K/mcL,K/mcL,-216
173,55330219,141179,1487,3,WBC x 1000,8.1,8.1,K/mcL,K/mcL,1546


In [23]:
with open('./data/lab_names.txt', 'w') as output_file:
    for name in df_lab[KEY_LAB_NAME].unique():
        measure = df_lab[df_lab[KEY_LAB_NAME] == name].iat[0, 7]
        output_file.write(name)
        output_file.write(' (')
        output_file.write(
            measure if measure == measure else '?'
        )
        output_file.write(')')
        output_file.write('\n')


## aperiodic


In [22]:
df_aperiodic = pd.read_csv(
    eicu_path('vitalAperiodic.csv.gz'),
    nrows=10000,
)
df_aperiodic.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   vitalaperiodicid      10000 non-null  int64  
 1   patientunitstayid     10000 non-null  int64  
 2   observationoffset     10000 non-null  int64  
 3   noninvasivesystolic   9971 non-null   float64
 4   noninvasivediastolic  9971 non-null   float64
 5   noninvasivemean       9999 non-null   float64
 6   paop                  1 non-null      float64
 7   cardiacoutput         0 non-null      float64
 8   cardiacinput          0 non-null      float64
 9   svr                   0 non-null      float64
 10  svri                  0 non-null      float64
 11  pvr                   0 non-null      float64
 12  pvri                  0 non-null      float64
dtypes: float64(10), int64(3)
memory usage: 1015.8 KB


In [15]:
len(df_aperiodic[KEY_IDENTITY].unique())


133

## periodic


In [35]:
df_periodic = pd.read_csv(
    eicu_path('vitalPeriodic.csv.gz'),
    nrows=10000,
)
df_periodic.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   vitalperiodicid    10000 non-null  int64  
 1   patientunitstayid  10000 non-null  int64  
 2   observationoffset  10000 non-null  int64  
 3   temperature        265 non-null    float64
 4   sao2               8075 non-null   float64
 5   heartrate          9995 non-null   float64
 6   respiration        7834 non-null   float64
 7   cvp                3584 non-null   float64
 8   etco2              0 non-null      float64
 9   systemicsystolic   4253 non-null   float64
 10  systemicdiastolic  4252 non-null   float64
 11  systemicmean       4307 non-null   float64
 12  pasystolic         2355 non-null   float64
 13  padiastolic        2355 non-null   float64
 14  pamean             2355 non-null   float64
 15  st1                3622 non-null   float64
 16  st2                3680

## infusion drug


In [13]:
df_infusion = pd.read_csv(
    './data/infusion_eicu_processed.csv.gz',
    # INFUSION_PATH,
)
df_infusion.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2881 entries, 0 to 2880
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   patientunitstayid  2881 non-null   int64  
 1   infusionoffset     2881 non-null   int64  
 2   drugname           2881 non-null   object 
 3   drugamount         2881 non-null   float64
dtypes: float64(1), int64(2), object(1)
memory usage: 90.2+ KB


In [4]:
with open('./data/infusion_drug_names.txt', 'w') as output_file:
    for name in df_infusion[KEY_INFUSION_NAME].unique():
        output_file.write(str(name))
        output_file.write('\n')


In [14]:
len(df_infusion[KEY_IDENTITY].unique())


1693

In [16]:
sum(
    int(identity in df_sepsis[KEY_IDENTITY])
    for identity in df_infusion[KEY_IDENTITY].unique()
)


0

## extracted data (full)


In [2]:
df_full = pd.read_csv('./data/data_eicu_full.csv.gz')
df_full.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100308 entries, 0 to 100307
Data columns (total 59 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   patientunitstayid   100308 non-null  int64  
 1   gender              100308 non-null  object 
 2   age                 100308 non-null  float64
 3   ethnicity           100308 non-null  object 
 4   Apache-IV           100308 non-null  float64
 5   offset              100308 non-null  int64  
 6   vasopressor         100308 non-null  int64  
 7   heparin             100308 non-null  int64  
 8   urine               100308 non-null  float64
 9   PEEP                100308 non-null  float64
 10  creatinine          100308 non-null  float64
 11  platelet            100308 non-null  float64
 12  INR                 100308 non-null  float64
 13  PT                  100308 non-null  float64
 14  PTT                 100308 non-null  float64
 15  lactate             100308 non-nul

In [3]:
df_full_non_categorical = df_full.drop(
    columns=[KEY_IDENTITY, *CATEGORICAL_COLUMNS]
)
negative_mask = df_full_non_categorical > 9999
df_full_non_categorical[
    negative_mask.any(axis='columns')
]


Unnamed: 0,age,Apache-IV,offset,urine,PEEP,creatinine,platelet,INR,PT,PTT,...,temperature,RBC transfusion,FFP transfusion,PLT transfusion,BMI,indirect bilirubin,PaO2/FiO2,SpO2/FiO2,ROX index,flag
516,84.0,83.000000,9,1182.860945,6.141111,2.870,66.0,1.400000,14.600000,37.000000,...,39.006611,0.0,0.0,0.0,29.605976,-0.100000,inf,inf,inf,0
691,29.0,71.000000,3,1182.860945,6.141111,1.460,145.0,1.000000,10.200000,24.000000,...,37.311806,0.0,0.0,0.0,22.162400,1.132214,inf,inf,inf,0
715,45.0,65.000000,1,1182.860945,6.141111,1.290,234.0,1.485258,16.906363,38.658497,...,39.006611,0.0,0.0,0.0,43.047441,-0.673274,inf,inf,inf,0
2205,64.0,70.896397,11,1182.860945,5.000000,1.205,38.2,1.625000,17.075000,30.000000,...,36.350357,0.0,0.0,0.0,31.215340,5.950000,2.065,2.427604,0.077465,0
4678,68.0,76.000000,2,1182.860945,6.141111,0.860,668.0,1.400000,15.100000,29.000000,...,39.006611,0.0,0.0,0.0,30.336148,-0.573274,inf,inf,inf,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99867,73.0,93.000000,17,150.000000,6.141111,2.890,43.0,1.670000,16.600000,33.000000,...,39.006611,0.0,0.0,0.0,26.758431,0.700000,4.600,3.331944,0.170245,0
99869,73.0,93.000000,19,150.000000,6.141111,3.380,48.0,1.670000,16.600000,33.000000,...,39.006611,0.0,0.0,0.0,26.758431,0.700000,4.600,3.332523,0.153538,0
99870,73.0,93.000000,20,150.000000,6.141111,4.460,74.0,1.670000,16.600000,33.000000,...,39.006611,0.0,0.0,0.0,26.758431,0.700000,4.600,3.332026,0.161227,0
99871,73.0,93.000000,21,150.000000,6.141111,3.360,67.0,1.670000,16.600000,33.000000,...,39.006611,0.0,0.0,0.0,26.758431,0.700000,4.600,3.332026,0.161227,0


In [7]:
s = df_full.nunique() == 1
s[s]

Series([], dtype: bool)

In [8]:
print(
    'patient count: {:,d}'.format(
        len(df_full[KEY_IDENTITY].unique())
    )
)


patient count: 17,729


In [9]:
positive_count = len(
    df_full.query(f'flag == {FLAG_POSITIVE:d}')
)
total_count = len(df_full)
positive_rate = positive_count / total_count
print(f'positive rate: {positive_rate:.2%}')


positive rate: 3.85%
