In [125]:
import pandas as pd


## Initial exploration of Patient data

### TODO
- choose appropriate features
- potentially consolidate different values (ethnicity only white not white - russian, white - other european ...)

In [126]:
DATA_PATH = "../MIMIC-III/precomputed/all_hourly_data.h5"

data_stats = pd.read_hdf(DATA_PATH, 'patients')
data_stats.index = data_stats.index.droplevel(['hadm_id', 'icustay_id']) # only one ID is necessary

### Features to use:
- age
- gender
- ethnicity -- out
- insurance
- admission_type
- first_careunit


### Maybe add in:
- los_icu - good here but only known after stay has ended --> not good for prediction
- fullcode, dnr, cmo
- admittime, dischtime --> convert to los maybe?
- intime (into icu)
- diagnosis at admission --> 11.000 different levels might be to many --> maybe consolidate




In [127]:
keepcols = ['gender', 'ethnicity', 'age', 'insurance', 'admission_type', 'first_careunit']

In [128]:
data_stats = data_stats[keepcols]

In [129]:
data_stats.head()

Unnamed: 0_level_0,gender,ethnicity,age,insurance,admission_type,first_careunit
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,M,WHITE,76.526792,Medicare,EMERGENCY,MICU
4,F,WHITE,47.845047,Private,EMERGENCY,MICU
6,F,WHITE,65.942297,Medicare,ELECTIVE,SICU
9,M,UNKNOWN/NOT SPECIFIED,41.790228,Medicaid,EMERGENCY,MICU
11,F,WHITE,50.148295,Private,EMERGENCY,SICU


## Some features need to be tweaked to work
### Ethnicity
- Combine different subgroups into more broad categories


In [130]:
data_stats['ethnicity'].value_counts()


WHITE                                                       24429
UNKNOWN/NOT SPECIFIED                                        3221
BLACK/AFRICAN AMERICAN                                       2456
HISPANIC OR LATINO                                            881
OTHER                                                         785
UNABLE TO OBTAIN                                              652
ASIAN                                                         545
PATIENT DECLINED TO ANSWER                                    351
ASIAN - CHINESE                                               166
HISPANIC/LATINO - PUERTO RICAN                                124
BLACK/CAPE VERDEAN                                            122
WHITE - RUSSIAN                                                99
MULTI RACE ETHNICITY                                           77
BLACK/HAITIAN                                                  64
WHITE - OTHER EUROPEAN                                         59
HISPANIC/L

In [131]:
# Crude consolidation function adapted from here:
# https://github.com/MLforHealth/MIMIC_Extract/blob/master/notebooks/Summary%20Stats.ipynb (MIMIC Extract Github)


def categorize_ethnicity(ethnicity):
    if 'ASIAN' in ethnicity or 'MIDDLE EASTERN' in ethnicity:
        ethnicity = 'ASIAN'
    elif 'WHITE' in ethnicity or 'PORTUGUESE' in ethnicity:
        ethnicity = 'WHITE'
    elif 'HISPANIC' in ethnicity or 'SOUTH AMERICAN' in ethnicity:
        ethnicity = 'HISPANIC/LATINO'
    elif 'BLACK' in ethnicity:
        ethnicity = 'BLACK'
    elif 'OTHER' in ethnicity or 'ISLAND' in ethnicity or 'MULTI' in ethnicity or 'AMERICAN INDIAN' in ethnicity:
        ethnicity = 'OTHER'
    else: 
        ethnicity = 'MISSING'
    return ethnicity

In [132]:
data_stats.loc[:,'ethnicity'] = data_stats['ethnicity'].apply(categorize_ethnicity)

  data_stats.loc[:,'ethnicity'] = data_stats['ethnicity'].apply(categorize_ethnicity)


In [133]:
data_stats['ethnicity'].value_counts()

WHITE              24675
MISSING             4224
BLACK               2667
HISPANIC/LATINO     1144
OTHER                897
ASIAN                865
Name: ethnicity, dtype: int64

### Age
- combine age groups into buckets (ages above 89 are set to ~300 to protect subjects)
- range 15 - 310 
- new groups:
  - 90+
  - 70-89
  - 50-69
  - 30-49
  - 10-29

In [134]:
data_stats['age'].describe()

count    34472.000000
mean        75.029209
std         55.853725
min         15.052693
25%         52.466861
50%         65.741463
75%         78.016920
max        310.280861
Name: age, dtype: float64

In [135]:
def categorize_age(age):
    if age < 30: 
        cat = '<30'
    elif age < 50:
        cat = '30-49'
    elif age < 70:
        cat = '50-69'
    elif age < 90:
        cat = '70-89'
    else: 
        cat = '>90'
    return cat

In [136]:
data_stats.loc[:,'age'] = data_stats['age'].apply(categorize_age)

In [137]:
data_stats['age'].value_counts()

50-69    12938
70-89    12377
30-49     5489
>90       1836
<30       1832
Name: age, dtype: int64

In [138]:
data_stats.value_counts()

gender  ethnicity  age    insurance   admission_type  first_careunit
F       WHITE      70-89  Medicare    EMERGENCY       MICU              1395
M       WHITE      70-89  Medicare    EMERGENCY       MICU              1295
                                                      CCU                749
                   50-69  Private     EMERGENCY       MICU               718
F       WHITE      70-89  Medicare    EMERGENCY       CCU                658
                                                                        ... 
M       ASIAN      30-49  Medicare    ELECTIVE        MICU                 1
F       ASIAN      30-49  Government  ELECTIVE        SICU                 1
M       ASIAN      50-69  Government  ELECTIVE        CCU                  1
                                                      SICU                 1
        WHITE      >90    Self Pay    EMERGENCY       TSICU                1
Length: 1639, dtype: int64