In [2]:
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.impute import KNNImputer
pd.options.mode.chained_assignment = None

In [3]:
# --- Tables under data/data_by_table -------------------------------------
baseline_df = pd.read_csv("data/data_by_table/baseline.csv")
cohort_subject_id_stay_id_df = pd.read_csv("data/data_by_table/cohort_subject_id_stay_id.csv")
vitalsign_df = pd.read_csv("data/data_by_table/mimiciv_derived_vitalsign.csv")
labevents_df = pd.read_csv("data/data_by_table/mimiciv_hosp_labevents.csv")
ground_truth_df = pd.read_csv("data/data_by_table/ground_truth.csv")
# NOTE: label_df is read from the same file as ground_truth_df.
label_df = pd.read_csv("data/data_by_table/ground_truth.csv")

# --- Tables under data/paper_data ----------------------------------------
ventilator_setting_df = pd.read_csv("data/paper_data/ven_setting.csv")
anion_df = pd.read_csv("data/paper_data/Anion.csv")
urine_df = pd.read_csv("data/paper_data/urine_output.csv")
# The GCS export carries an extra 'charttime_1' column that is discarded here.
GCS_df = pd.read_csv("data/paper_data/GCS.csv").drop(columns='charttime_1')

In [6]:
def generate_df_template(stay_id, endtime, n_hours=24):
    """Build the hourly time grid that ends at a stay's reference time.

    Parameters
    ----------
    stay_id : scalar
        Identifier written into every row of the grid.
    endtime : str or datetime-like
        Reference time; it is floored to the hour and becomes the first row.
    n_hours : int, default 24
        Number of hourly rows to generate, counting backwards from the
        floored ``endtime``.

    Returns
    -------
    pandas.DataFrame
        Columns ``charttime`` (descending, one row per hour) and ``stay_id``.
    """
    # Lowercase 'h' is the non-deprecated hourly alias; uppercase 'H' emits a
    # FutureWarning on recent pandas releases.
    end_time = pd.to_datetime(endtime).floor('h')
    time_intervals = [end_time - timedelta(hours=i) for i in range(n_hours)]
    df = pd.DataFrame(time_intervals, columns=['charttime'])
    df['stay_id'] = stay_id
    return df

In [7]:
def generate_all_template(ground_truth_df):
    """Stack the hourly time grids of every stay listed in ``ground_truth_df``.

    Parameters
    ----------
    ground_truth_df : pandas.DataFrame
        Must provide the columns ``stay_id`` and ``endtime``; one grid is
        generated per row via ``generate_df_template``.

    Returns
    -------
    pandas.DataFrame
        All per-stay grids concatenated row-wise. Each grid keeps its own
        0-based index (``ignore_index=False``), so index values repeat.
        An empty DataFrame is returned when there are no rows.
    """
    # Collect the pieces first and concatenate once: growing a DataFrame with
    # pd.concat inside the loop copies all previous rows on every iteration.
    frames = [
        generate_df_template(stay_id, endtime)
        for stay_id, endtime in zip(ground_truth_df['stay_id'], ground_truth_df['endtime'])
    ]
    if not frames:
        # pd.concat raises on an empty list; keep the original empty result.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=False)

In [8]:
def fill_na(df):
    """Resample a measurement table to hourly rows per stay and fill gaps.

    For every ``stay_id`` the rows are bucketed into hourly bins (taking the
    maximum of each column inside a bin). Missing values are then filled
    forward and, for gaps at the start of a stay, backward, always within the
    same stay.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``stay_id`` and ``charttime``; ``charttime`` may be
        strings or datetimes. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per (stay_id, hour) with ``stay_id`` and ``charttime`` as
        ordinary columns; ``subject_id`` and ``hadm_id`` are removed when
        present.
    """
    # assign() builds a new frame, so the caller's DataFrame keeps its
    # original 'charttime' column instead of being converted in place.
    df = df.assign(charttime=pd.to_datetime(df['charttime']))
    df = df.sort_values(by=['stay_id', 'charttime']).set_index('charttime')
    df_resampled = df.groupby('stay_id').resample('h').max()
    for col in df.columns:
        if col not in ('stay_id', 'subject_id', 'charttime'):
            # Group on the 'stay_id' index level so values never leak
            # from one stay into another.
            df_resampled[col] = df_resampled[col].groupby(level='stay_id').ffill()
            df_resampled[col] = df_resampled[col].groupby(level='stay_id').bfill()
    # Depending on the pandas version the grouping column may or may not be
    # repeated as a regular column; drop it only when it is there, the index
    # level restored by reset_index() is the one that is kept.
    df_resampled = df_resampled.drop(columns='stay_id', errors='ignore')
    df_resampled = df_resampled.reset_index()
    return df_resampled.drop(columns=['subject_id', 'hadm_id'], errors='ignore')

In [15]:
def min_agg(series):
    """Return the smallest non-missing value of ``series``, or NaN if there is none."""
    observed = series.dropna()
    # With nothing observed the result stays missing.
    return observed.min() if len(observed) > 0 else np.nan
def max_agg(series):
    """Return the largest non-missing value of ``series``, or NaN if there is none."""
    observed = series.dropna()
    # With nothing observed the result stays missing.
    return observed.max() if len(observed) > 0 else np.nan
def HR_agg(series): # this rule is by Dr.
    """Collapse several heart-rate readings into one value.

    Rule (as given by the clinician):
      1. every reading is >= 80  -> return the maximum;
      2. every reading is <  80  -> return the minimum;
      3. mixed readings          -> return the minimum.

    Rules 2 and 3 yield the same value, so only rule 1 needs to be tested.
    A missing reading compares as False against the threshold and therefore
    selects the minimum; ``min``/``max`` themselves skip missing values.
    """
    # NOTE(review): rules 2 and 3 are identical in the original code; confirm
    # with the rule's author that "mixed -> min" is really intended.
    threshold = 80
    if (series >= threshold).all():
        return series.max()
    return series.min()
def RR_agg(series): # this rule is by Dr.
    """Collapse several respiratory-rate readings into one value.

    Rule (as given by the clinician):
      1. every reading is >= 12  -> return the maximum;
      2. every reading is <  12  -> return the minimum;
      3. mixed readings          -> return the minimum.

    Rules 2 and 3 yield the same value, so only rule 1 needs to be tested.
    A missing reading compares as False against the threshold and therefore
    selects the minimum; ``min``/``max`` themselves skip missing values.
    """
    # NOTE(review): rules 2 and 3 are identical in the original code; confirm
    # with the rule's author that "mixed -> min" is really intended.
    threshold = 12
    if (series >= threshold).all():
        return series.max()
    return series.min()

def fill_and_merge(data_all, df_target, ground_truth_df):
    """Align one measurement table to the per-stay hourly grid and join it.

    Steps:
      1. build the hourly grid for every stay in ``ground_truth_df``;
      2. resample/fill ``df_target`` per stay with ``fill_na``;
      3. outer-merge the two so grid hours without a measurement exist as
         rows, resample hourly again and fill forward, then backward, within
         each stay;
      4. inner-merge the result onto ``data_all`` on
         ``['stay_id', 'charttime']``.

    Parameters
    ----------
    data_all : pandas.DataFrame
        Frame to attach the filled measurements to; needs ``stay_id`` and
        ``charttime``.
    df_target : pandas.DataFrame
        Raw measurement table with ``stay_id`` and ``charttime``.
    ground_truth_df : pandas.DataFrame
        Passed to ``generate_all_template`` to build the hourly grid.

    Returns
    -------
    pandas.DataFrame
        ``data_all`` rows that have a match in the filled table, extended by
        the measurement columns.
    """
    template = generate_all_template(ground_truth_df)
    df_target = fill_na(df_target)
    df = pd.merge(df_target, template, how='outer', on=['stay_id', 'charttime'])
    df['charttime'] = pd.to_datetime(df['charttime'])
    df = df.sort_values(by=['stay_id', 'charttime']).set_index('charttime')
    # Lowercase 'h' is the non-deprecated hourly alias.
    df_resampled = df.groupby('stay_id').resample('h').max()
    for col in df.columns:
        if col not in ('stay_id', 'subject_id', 'charttime'):
            # ffill()/bfill() on the grouped series replace the deprecated
            # fillna(method=...) calls; grouping by the index level keeps
            # the fill inside a single stay.
            df_resampled[col] = df_resampled[col].groupby(level='stay_id').ffill()
            df_resampled[col] = df_resampled[col].groupby(level='stay_id').bfill()
    # The grouping column is only repeated as a regular column on some pandas
    # versions; drop it when present and keep the index level instead.
    df_resampled = df_resampled.drop(columns='stay_id', errors='ignore')
    df_resampled = df_resampled.reset_index()
    return pd.merge(data_all, df_resampled, how='inner', on=['stay_id', 'charttime'])
    

In [16]:
def check_missing_values(df):
    """Return 1 when ``df`` holds at least one missing value, otherwise 0."""
    has_missing = df.isna().any().any()
    return int(has_missing)

def add_label_id(df,stay_id, flag, r_v, dod):
    """Return the rows of one stay with its label columns attached.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table with ``stay_id`` and ``charttime`` columns. It is not
        modified.
    stay_id : scalar
        Stay whose rows are selected.
    flag : scalar
        Value written to the ``label`` column.
    r_v : float or None
        Value written to ``Rev_h``; a missing value is stored as -1000.
    dod : float or None
        Value written to ``dod_h``; a missing value is stored as -1000.

    Returns
    -------
    pandas.DataFrame
        The selected rows plus ``label``, ``Rev_h`` and ``dod_h``, sorted by
        ``stay_id`` and ``charttime``.
    """
    missing_sentinel = -1000  # placeholder stored when a time value is absent
    # pd.isna also accepts None / NaT, where np.isnan would raise TypeError.
    rev_h = missing_sentinel if pd.isna(r_v) else r_v
    dod_h = missing_sentinel if pd.isna(dod) else dod
    # assign() works on a new frame, so no values are written onto a slice of
    # the caller's DataFrame.
    selected_data = df.loc[df['stay_id'] == stay_id].assign(
        label=flag, Rev_h=rev_h, dod_h=dod_h
    )
    return selected_data.sort_values(by=['stay_id', 'charttime'])


def add_label(df, df_label):
    """Attach label columns to the rows of every stay listed in ``df_label``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table with ``stay_id`` and ``charttime`` columns.
    df_label : pandas.DataFrame
        One row per stay with the columns ``stay_id``, ``label``,
        ``re_vent_time_diff`` and ``weaning_till_dod_hr``.

    Returns
    -------
    pandas.DataFrame
        The per-stay results of ``add_label_id`` concatenated row-wise in the
        order of ``df_label`` (original row indices are kept). An empty
        DataFrame is returned when ``df_label`` has no rows.
    """
    # Build all pieces first and concatenate once; repeated pd.concat inside
    # the loop would re-copy the accumulated rows on every iteration.
    labelled_parts = [
        add_label_id(
            df,
            row['stay_id'],
            row['label'],
            row['re_vent_time_diff'],
            row['weaning_till_dod_hr'],
        )
        for _, row in df_label.iterrows()
    ]
    if not labelled_parts:
        # pd.concat raises on an empty list; keep the original empty result.
        return pd.DataFrame()
    return pd.concat(labelled_parts, ignore_index=False)

In [17]:
# Hourly grid (24 rows per stay) for every stay in ground_truth_df; it is the
# frame each measurement table is joined onto below.
data_template = generate_all_template(ground_truth_df)
# Each call resamples one raw table to hourly rows, fills gaps within a stay
# and inner-joins it onto the grid. The prints are progress markers.
ventilator_setting_df_24 = fill_and_merge(data_template, ventilator_setting_df, ground_truth_df)
print("ventilator_setting_df_24 finish")
labevents_df_24 = fill_and_merge(data_template, labevents_df, ground_truth_df)
print("labevents_df_24 finish")
vitalsign_df_24 = fill_and_merge(data_template, vitalsign_df, ground_truth_df)
print("vitalsign_df_24 finish")
GCS_df_24 = fill_and_merge(data_template, GCS_df, ground_truth_df)
print("GCS_df_24 finish")
anion_df_24 = fill_and_merge(data_template, anion_df, ground_truth_df)
print("anion_df_24 finish")

  df_resampled = df.groupby('stay_id').resample('H').max()


ventilator_setting_df_24 finish
labevents_df_24 finish
vitalsign_df_24 finish
GCS_df_24 finish
anion_df_24 finish


In [18]:
# Impute missing height/weight from the 2 nearest rows in (height, weight) space.
imputer = KNNImputer(n_neighbors=2)
# NOTE: this binds a second name to the SAME DataFrame (no copy), so the
# imputation below also updates baseline_df. The merge cell further down
# joins baseline_df, so it relies on this aliasing.
baseline_df_p = baseline_df
baseline_df_p[['height_cm', 'weight_kg']] = imputer.fit_transform(baseline_df_p[['height_cm', 'weight_kg']])
# Fallback: fill anything still missing with the mean of the same gender.
# Assigning the result back (instead of fillna(..., inplace=True) on a column
# selection) guarantees the frame is actually updated; the in-place form on a
# selected column is chained assignment and is not reliable under
# Copy-on-Write.
baseline_df_p['height_cm'] = baseline_df_p['height_cm'].fillna(
    baseline_df_p.groupby('gender')['height_cm'].transform('mean')
)
baseline_df_p['weight_kg'] = baseline_df_p['weight_kg'].fillna(
    baseline_df_p.groupby('gender')['weight_kg'].transform('mean')
)
# From here on baseline_df_p is a new frame without the identifier columns;
# baseline_df keeps them.
baseline_df_p = baseline_df_p.drop(columns=['subject_id', 'hadm_id'])

In [24]:
# Join the hourly tables on (stay_id, charttime); inner joins keep only the
# rows that are present in every table.
merged_df = pd.merge(labevents_df_24, vitalsign_df_24, on=['stay_id', 'charttime'], how='inner')
# NOTE(review): labevents_df_24 is joined a second time here. Its non-key
# columns then collide and receive pandas' default '_x'/'_y' suffixes, which
# is presumably where the 'O2_flow_x'/'O2_flow_y' columns dropped below come
# from. Confirm whether this duplicate join is intentional.
merged_df = pd.merge(merged_df, labevents_df_24, on=['stay_id', 'charttime'], how='inner')
merged_df = pd.merge(merged_df, ventilator_setting_df_24, on=['stay_id', 'charttime'], how='inner')
merged_df = pd.merge(merged_df, GCS_df_24, on=['stay_id', 'charttime'], how='inner')
# The anion table is built earlier but deliberately left out of the join.
#merged_df = pd.merge(merged_df, anion_df_24, on=['stay_id', 'charttime'], how='inner')
# NOTE(review): this joins baseline_df (which still has subject_id/hadm_id),
# not baseline_df_p; the identifier columns are removed by the drop below.
merged_df = pd.merge(merged_df, baseline_df, on=['stay_id'], how='inner')
merged_df = merged_df.drop(columns=['subject_id', 'hadm_id', 'ventilator_type','O2_flow_x', 'O2_flow_y'])
# Attach label / Rev_h / dod_h per stay.
final_data = add_label(merged_df, ground_truth_df)
# Derived features; the 0.001 factor presumably converts tidal volume from mL
# to L (TODO confirm units). A tidal volume of 0 gives inf for RSBI.
final_data['RSBI'] = final_data['resp_rate'] / (final_data['tidal_volume_observed'] * 0.001)
final_data['minute_ventilation'] = final_data['resp_rate'] * (final_data['tidal_volume_observed'] * 0.001)

In [26]:
# Sanity check: list the columns of the final feature table.
print(final_data.columns)

Index(['charttime', 'stay_id', 'heart_rate', 'sbp', 'dbp', 'mbp', 'resp_rate',
       'spo2', 'peep', 'fio2', 'tidal_volume_observed', 'respiratory_rate_set',
       'plateau_pressure', 'GCS', 'age_now', 'gender', 'insurance', 'race',
       'admission_type', 'first_careunit', 'weight_kg', 'height_cm', 'tobacco',
       'label', 'Rev_h', 'dod_h', 'RSBI', 'minute_ventilation'],
      dtype='object')
