# Create training and test sets
1. Load the dataset with metadata and a separate dataset with clinical features
2. Select a subset of patients that have an associated CXR study
3. Fill in missing values
4. Create train/test splits for 5 different classification strategies (S0–S4)

___
# EMR data

In [1]:
import numpy as np
import pandas as pd
import utils
from sklearn.model_selection import train_test_split

**Load datasets**

In [2]:
# Load the EMR table, then show its dimensions and first rows.
# NOTE(review): the preview below includes an "Unnamed: 0" column, which looks
# like a row index that was saved into the CSV — confirm whether it is needed
# downstream or whether the file should be read with index_col=0.
df = pd.read_csv("../data/mimic-ft98.csv")
print(df.shape)
df.head()

(12652, 105)


Unnamed: 0,stay_id,starttime,endtime,admission_location,insurance,language,ethnicity,marital_status,gender,age,...,sodium_score,albumin_score,bilirubin_score,glucose_score,acidbase_score,gcs_score,duration,log_duration,over72h,alive96h
0,30000670,2182-04-14 07:45:00,2182-04-15 10:00:00,EMERGENCY ROOM,Medicare,ENGLISH,BLACK/AFRICAN AMERICAN,DIVORCED,M,69,...,0.0,,,0.0,12.0,0.0,26.25,3.267666,0,1
1,30000974,2119-06-21 19:30:00,2119-07-07 13:10:00,EMERGENCY ROOM,Medicare,ENGLISH,WHITE,SINGLE,F,92,...,0.0,6.0,0.0,0.0,12.0,13.0,377.666667,5.934012,1,1
2,30001939,2151-04-06 16:55:00,2151-04-15 15:40:00,EMERGENCY ROOM,Medicaid,ENGLISH,WHITE,SINGLE,M,47,...,2.0,11.0,0.0,0.0,12.0,48.0,214.75,5.369475,1,1
3,30002055,2171-09-26 14:28:00,2171-09-29 08:55:00,WALK-IN/SELF REFERRAL,Medicare,ENGLISH,BLACK/AFRICAN AMERICAN,MARRIED,M,69,...,,,,3.0,2.0,0.0,66.45,4.19645,0,1
4,30003299,2169-08-22 01:51:00,2169-08-28 12:02:00,EMERGENCY ROOM,Other,ENGLISH,WHITE,SINGLE,M,26,...,0.0,,,0.0,12.0,15.0,154.183333,5.038142,1,1


**Cluster by severity**

In [3]:
df, pca = utils.cluster_by_severity(df)

Using 24 severity scores...
Fitting PCA...
2    4350
1    4297
4    2084
3    1921
Name: cluster, dtype: int64


**Only XLarge hospitals**

In [None]:
df = df[df.numbedscategory=="XL"]
df.shape

**Single hospitals**

In [None]:
hospid = df.hospitalid.value_counts().head(3).index

In [None]:
df = df[df.hospitalid == hospid[2]]
df.shape

**Complete cases only**

In [None]:
df.info()

In [None]:
df.isna().sum(axis=0).sort_values(ascending=False).head(10)

In [None]:
# df.drop(columns=['co2_total_max', 'co2_total_min', 'co2_total_avg'], inplace=True)
# df.drop(columns=['albumin_score'], inplace=True)

In [None]:
# Keep complete cases only. Reassign instead of using inplace=True: `df` may be
# a boolean-filtered selection from an earlier cell, and in-place edits on such
# a frame trigger SettingWithCopy warnings and make the cell harder to re-run.
df = df.dropna(axis=0, how="any")
df.shape

In [None]:
df.log_duration.hist();

In [None]:
df.over72h.value_counts()

**Extract records with associated CXR data**

In [None]:
df_meta = pd.read_csv("../data/mimic-metadata.csv")
print(df_meta.shape)
df_meta.head()

In [None]:
# df = df[df.stay_id.isin(df_meta[df_meta.study_id.notna()].stay_id)].copy()
# df.head()

### Create a train/test split for EMR data

<img src="../results/class distribution.jpeg" alt="Groups" style="width: 400px;"/>

In [4]:
def _report_split(df_part, part_name):
    """Print the record count and outcome class balance of one split."""
    n_records = df_part.shape[0]
    print("The %s set contains %d records." % (part_name, n_records))
    print()
    print("Proportion of patients intubated over 72h:")
    print(df_part.over72h.value_counts() / n_records)
    print()
    # The survival column is optional. Test for it explicitly rather than
    # wrapping the report in a bare `except:`, which would also hide unrelated
    # errors and print the header even when there is nothing to report.
    if "alive96h" in df_part.columns:
        print("Proportion of patients alive after 96h:")
        print(df_part.alive96h.value_counts() / n_records)
    print()


def get_train_test_split(df, strata, test_size=0.2, random_state=42):
    """
    Split `df` into stratified train and test partitions and print a summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Records to split. Must contain the column named by `strata` and an
        `over72h` column; an `alive96h` column is reported when present.
    strata : str
        Name of the column whose values are used for stratification.
    test_size : float, default 0.2
        Passed to `train_test_split` as `test_size`.
    random_state : int, default 42
        Seed passed to `train_test_split` so the split is reproducible.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
    """
    df_train, df_test = train_test_split(
        df, test_size=test_size, random_state=random_state, stratify=df[strata]
    )
    _report_split(df_train, "training")
    _report_split(df_test, "test")
    return df_train, df_test

In [5]:
def split_data_S0(df, name):
    """
    Classification strategy 0: simply split by prolonged ventilation.

    The whole frame is split with stratification on `over72h`. The two parts
    are written to "../data/<name>-S0-train.csv" and
    "../data/<name>-S0-test.csv" without the index. Returns nothing.
    """
    print("Strategy 0")
    print()
    train_part, test_part = get_train_test_split(df, "over72h")

    # Report the final sizes, training set first.
    for message, part in (
        ("%d samples in the final training set. ", train_part),
        ("%d samples in the final test set. ", test_part),
    ):
        print(message % part.shape[0])
    print()

    # Persist both partitions, training set first.
    for out_path, part in (
        ("../data/" + name + "-S0-train.csv", train_part),
        ("../data/" + name + "-S0-test.csv", test_part),
    ):
        part.to_csv(out_path, index=False)
    
def split_data_S1(df, name):
    """
    Classification strategy 1: exclude A and C, predict B vs D.

    Only rows with `alive96h == 1` go through the stratified split (on
    `over72h`). Rows with `alive96h == 0` are appended to the test partition
    afterwards. The parts are written to "../data/<name>-S1-train.csv" and
    "../data/<name>-S1-test.csv" without the index. Returns nothing.
    """
    print("Strategy 1")
    print()
    survivors = df[df.alive96h == 1]
    non_survivors = df[df.alive96h == 0]

    train_part, held_out = get_train_test_split(survivors, "over72h")
    test_part = pd.concat([held_out, non_survivors], axis=0)

    # Report the final sizes, training set first.
    for message, part in (
        ("%d samples in the final training set. ", train_part),
        ("%d samples in the final test set. ", test_part),
    ):
        print(message % part.shape[0])
    print()

    # Persist both partitions, training set first.
    for out_path, part in (
        ("../data/" + name + "-S1-train.csv", train_part),
        ("../data/" + name + "-S1-test.csv", test_part),
    ):
        part.to_csv(out_path, index=False)


def split_data_S2(df, name):
    """
    Classification strategy 2: exclude A, predict B vs C+D.

    A helper column `strata` is built by concatenating `over72h` and
    `alive96h` as strings. Rows whose strata is "00" (over72h == 0 and
    alive96h == 0) are kept out of the stratified split and appended to the
    test partition afterwards. The helper column is removed before writing
    "../data/<name>-S2-train.csv" and "../data/<name>-S2-test.csv".

    The caller's `df` is left untouched. Returns nothing.
    """
    print("Strategy 2")
    print()
    # `assign` returns a new frame, so the temporary column never leaks into
    # the caller's data (and into CSVs written by other strategies), and no
    # SettingWithCopy warning is raised when `df` is a filtered selection.
    labelled = df.assign(strata=df.over72h.astype(str) + df.alive96h.astype(str))

    df_train, df_test = get_train_test_split(labelled[labelled.strata != "00"], "strata")
    df_test = pd.concat([df_test, labelled[labelled.strata == "00"]], axis=0)
    print("%d samples in the final training set. "% df_train.shape[0])
    print("%d samples in the final test set. "% df_test.shape[0])
    print()

    df_train.drop(columns='strata').to_csv("../data/" + name + "-S2-train.csv", index=False)
    df_test.drop(columns='strata').to_csv("../data/" + name + "-S2-test.csv", index=False)
    
    
def split_data_S3(df, name):
    """
    Classification strategy 3: add group A to "bad outcome", predict B vs A+C+D.

    `good_outcome` is 1 for rows with over72h == 0 and alive96h == 1 and 0 for
    every other row; the split is stratified on it. The written CSVs
    ("../data/<name>-S3-train.csv" and "../data/<name>-S3-test.csv") keep
    `good_outcome` but not the helper column `strata`.

    Side effect: both `strata` and `good_outcome` are written onto the `df`
    object passed in. Returns nothing.
    """
    print("Strategy 3")
    print()

    # Two-character code: first character is over72h, second is alive96h.
    outcome_code = df.over72h.astype(str) + df.alive96h.astype(str)
    df["strata"] = outcome_code
    df["good_outcome"] = np.where(df["strata"] == "01", 1, 0)

    train_part, test_part = get_train_test_split(df, "good_outcome")

    # Report the final sizes, training set first.
    for message, part in (
        ("%d samples in the final training set. ", train_part),
        ("%d samples in the final test set. ", test_part),
    ):
        print(message % part.shape[0])
    print()

    # Persist both partitions without the helper column, training set first.
    for out_path, part in (
        ("../data/" + name + "-S3-train.csv", train_part),
        ("../data/" + name + "-S3-test.csv", test_part),
    ):
        part.drop(columns='strata').to_csv(out_path, index=False)
    

def split_data_S4(df, name):
    """
    Classification strategy 4: two-label classification
    - Predict prolonged ventilation: A+B vs C+D
    - Predict survival: A+C vs B+D

    The split is stratified on `strata`, the string concatenation of
    `over72h` and `alive96h`. The helper columns `strata` and `good_outcome`
    are removed before writing "../data/<name>-S4-train.csv" and
    "../data/<name>-S4-test.csv".

    The caller's `df` is left untouched. Returns nothing.
    """
    print("Strategy 4")
    print()
    # Work on a new frame so the temporary column does not leak to the caller.
    labelled = df.assign(strata=df.over72h.astype(str) + df.alive96h.astype(str))
    df_train, df_test = get_train_test_split(labelled, "strata")
    print("%d samples in the final training set. "% df_train.shape[0])
    print("%d samples in the final test set. "% df_test.shape[0])
    print()

    # `good_outcome` is only present when another step added it to the frame
    # beforehand; errors="ignore" lets this function run on its own instead of
    # raising KeyError when the column is absent.
    helper_columns = ['strata', 'good_outcome']
    df_train.drop(columns=helper_columns, errors="ignore").to_csv("../data/" + name + "-S4-train.csv", index=False)
    df_test.drop(columns=helper_columns, errors="ignore").to_csv("../data/" + name + "-S4-test.csv", index=False)

**Split data**

In [6]:
split_data_S0(df, name="mimic-ft98-clustered")

Strategy 0

The training set contains 10121 records.

Proportion of patients intubated over 72h:
1    0.519711
0    0.480289
Name: over72h, dtype: float64

Proportion of patients alive after 96h:
1    0.908803
0    0.091197
Name: alive96h, dtype: float64

The test set contains 2531 records.

Proportion of patients intubated over 72h:
1    0.519953
0    0.480047
Name: over72h, dtype: float64

Proportion of patients alive after 96h:
1    0.917819
0    0.082181
Name: alive96h, dtype: float64

10121 samples in the final training set. 
2531 samples in the final test set. 



In [None]:
split_data_S1(df, name="mimic-emr")

In [None]:
split_data_S2(df, name="mimic-emr")

In [None]:
split_data_S3(df, name="mimic-emr")

In [None]:
split_data_S4(df, name="mimic-emr")

___
### Imputation

In [None]:
# Read the S0 training split and discard the starttime / endtime columns.
df_train = pd.read_csv("../data/eicu-ft46-S0-train.csv").drop(columns=["starttime", "endtime"])

# Column name handed to utils.get_X_and_y as `label` further down.
label = "log_duration"

print(df_train.shape)
df_train.head()

In [None]:
# Read the S0 test split and discard the starttime / endtime columns.
df_test = pd.read_csv("../data/eicu-ft46-S0-test.csv").drop(columns=["starttime", "endtime"])

# Column name handed to utils.get_X_and_y as `label` further down.
label = "log_duration"

print(df_test.shape)
df_test.head()

In [None]:
# The 46 column names passed as `features` to utils.get_X_and_y and reused as
# the column labels of the imputed output tables.
features = ['ph_max', 'spo2_min',
       'heart_rate_min', 'heart_rate_max', 'resp_rate_min', 'resp_rate_max',
       'temp_min', 'temp_max', 'glucose_max', 'glucose_min', 'co2_total_max',
       'co2_total_min', 'mbp_max', 'mbp_ni_min', 'apsiii', 'peep_max',
       'peep_min', 'co2_total_avg', 'fio2_min', 'plateau_pressure_max',
       'height', 'peep_avg', 'temp_avg', 'hr_score', 'mbp_score', 'temp_score',
       'resp_rate_score', 'pao2_aado2_score', 'hematocrit_score', 'wbc_score',
       'creatinine_score', 'uo_score', 'bun_score', 'sodium_score',
       'albumin_score', 'bilirubin_score', 'glucose_score', 'acidbase_score',
       'gcs_score', 'SOFA', 'respiration', 'coagulation', 'liver',
       'cardiovascular', 'cns', 'renal']

In [None]:
# utils.get_X_and_y is defined outside this notebook; presumably it returns the
# selected feature columns and the label column for the given frame — verify
# against utils before relying on row order or row count.
X_train, y_train = utils.get_X_and_y(df_train, features=features, label=label)
print(X_train.shape, y_train.shape)

X_test, y_test = utils.get_X_and_y(df_test, features=features, label=label)
print(X_test.shape, y_test.shape)

# The preprocessor is configured from the training columns only.
preprocessor = utils.define_preprocessor(X_train.columns)

In [None]:
# Take the 'imputer' step from the first transformer of the preprocessor, fit
# it on the training features, then apply the fitted step to the test features.
imputer = preprocessor.transformers[0][1]['imputer']
X_train_imp = imputer.fit_transform(X_train)
X_test_imp = imputer.transform(X_test)

In [None]:
# Columns copied unchanged from each split and placed before the imputed features.
id_and_label_cols = ["stay_id", "over72h", "duration", "log_duration"]

for split_df, imputed_values, out_path in (
    (df_train, X_train_imp, "../data/eicu-ft46-S0-train-imputed.csv"),
    (df_test, X_test_imp, "../data/eicu-ft46-S0-test-imputed.csv"),
):
    # Give the imputed block the same index as its source frame so the
    # column-wise concat aligns rows explicitly instead of depending on two
    # default indexes happening to match. A row-count mismatch now raises
    # instead of silently producing NaN-padded rows.
    imputed_df = pd.DataFrame(imputed_values, columns=features, index=split_df.index)
    pd.concat([split_df[id_and_label_cols], imputed_df], axis=1).to_csv(out_path, index=False)

___
## Text data

**Load data**

In [None]:
df = pd.read_csv("../data/cxr-reports/mimic_cxr_sections.csv", header=None, names=["study_id", "text"])
print(df.shape)
df.head()

In [None]:
# Split each raw identifier on "s" and store the second piece as an integer.
df["study_id"] = df["study_id"].apply(lambda raw_id: int(raw_id.split("s")[1]))

In [None]:
assert df.shape[0] == df_meta[df_meta.study_id.notna()].shape[0]

**Merge with labels**

In [None]:
# Attach both outcome labels to each report via the shared study_id key
# (merge's default join type is used, as before).
label_columns = df_meta[["over72h", "alive96h", "study_id"]]
df = df.merge(label_columns, on="study_id")

**Split data**

In [None]:
split_data_S1(df, name="reports")

In [None]:
split_data_S2(df, name="reports")

In [None]:
split_data_S3(df, name="reports")

In [None]:
split_data_S4(df, name="reports")