<a href="https://colab.research.google.com/github/zxyao5148/STAT3612_2023_1A_GroupProject/blob/main/RNN_Transformer/Data_prerocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **STAT3612 Group Project** ##
Hospital readmission prediction based on patients' Electronic Health Record (EHR) data from MIMIC-IV v1.0

In [148]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [149]:
# Mount Google Drive into the Colab runtime at /content/drive; the data files
# read later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [165]:
# Import data for predicting readmission

# Single place to repoint the notebook at another copy of the project data.
DATA_DIR = '/content/drive/MyDrive/3612'

# NOTE(review): read_pickle can execute arbitrary code while unpickling;
# only load this file from a trusted source.
EHR = pd.read_pickle(f'{DATA_DIR}/ehr_preprocessed_seq_by_day_cat_embedding.pkl')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
train_original = pd.read_csv(f'{DATA_DIR}/train.csv')
valid_original = pd.read_csv(f'{DATA_DIR}/valid.csv')

# Quick sanity check of the (rows, columns) of each split.
print(test.shape)
print(train_original.shape)
print(valid_original.shape)

(17933, 12)
(55941, 13)
(13598, 13)


In [166]:
# Stack the training and validation splits row-wise into one frame with a
# fresh 0..n-1 index.
train = pd.concat((train_original, valid_original), axis=0, ignore_index=True)
train.shape

(69539, 13)

In [167]:
# Turn 'deathtime' into a 0/1 flag: 1 where a value is present, 0 where it is missing.
# Run this once per data load: after the first run every value is non-null,
# so a second run would set the whole column to 1.
train['deathtime'] = train['deathtime'].notna().astype(int)
test['deathtime'] = test['deathtime'].notna().astype(int)

In [168]:
def keep_latest_study(frame):
    """Collapse a split to one row per 'id', keeping the most recent study.

    Steps, applied to a copy of ``frame``:
      1. add 'times of study' = number of rows sharing the same 'id';
      2. parse 'StudyDate' (YYYYMMDD) and keep, per 'id', the row with the
         latest 'StudyDate';
      3. parse 'admittime' / 'dischtime' as datetimes;
      4. sort by 'subject_id', then 'admittime'.

    Returns the new frame; the input frame is not modified.
    """
    out = frame.copy()

    # Count rows per 'id' BEFORE filtering, otherwise every count would be 1.
    out['times of study'] = out['id'].map(out['id'].value_counts())

    out['StudyDate'] = pd.to_datetime(out['StudyDate'], format='%Y%m%d')
    latest_idx = out.groupby('id')['StudyDate'].idxmax()
    out = out.loc[latest_idx].copy()

    out['admittime'] = pd.to_datetime(out['admittime'])
    out['dischtime'] = pd.to_datetime(out['dischtime'])

    # Chronological order within each subject; the per-subject diff() in the
    # next cell depends on this ordering.
    return out.sort_values(by=['subject_id', 'admittime'])


# Same preprocessing for both splits (previously two copy-pasted pipelines).
train = keep_latest_study(train)
test = keep_latest_study(test)

In [169]:
def add_admission_gap_features(frame):
    """Add per-subject time-gap features, in seconds, to a copy of ``frame``.

    New columns (appended in this order):
      - 'admittime_diff', 'dischtime_diff': difference between a row's
        admit/discharge time and the previous row of the same 'subject_id'
        (NaN for a subject's first row);
      - 'mean_admittime_diff', 'mean_dischtime_diff': the subject-level mean
        of those differences, repeated on every row of that subject (NaN when
        the subject has a single row).

    Rows must already be ordered by 'admittime' within each 'subject_id',
    because diff() compares each row with the one directly before it.
    """
    out = frame.copy()

    # Raw gaps first, then the means, so the appended column order stays
    # admittime_diff, dischtime_diff, mean_admittime_diff, mean_dischtime_diff.
    for time_col in ('admittime', 'dischtime'):
        gaps = out.groupby('subject_id')[time_col].diff()
        out[f'{time_col}_diff'] = gaps.dt.total_seconds()

    for time_col in ('admittime', 'dischtime'):
        subject_mean = out.groupby('subject_id')[f'{time_col}_diff'].mean()
        out[f'mean_{time_col}_diff'] = out['subject_id'].map(subject_mean)

    return out


# Same feature engineering for both splits (previously two copy-pasted blocks).
train = add_admission_gap_features(train)
test = add_admission_gap_features(test)

In [170]:
# add two new predictors (length of stay + previous number of admissions)
def add_stay_and_prior_admissions(frame):
    """Return a copy of ``frame`` with 'stay_len' and 'prev_admits' added.

    - 'stay_len': whole days between 'admittime' and 'dischtime';
    - 'prev_admits': 0-based position of the row among the same subject's
      rows when ordered by 'admittime', i.e. how many earlier rows that
      'subject_id' has in this frame.

    The returned frame is sorted by 'admittime'; the input is not modified.
    """
    out = frame.copy()

    # length of stay
    out['dischtime'] = pd.to_datetime(out['dischtime'])
    out['admittime'] = pd.to_datetime(out['admittime'])
    out['stay_len'] = (out['dischtime'] - out['admittime']).dt.days

    # previous number of admissions
    out = out.sort_values('admittime')
    out['prev_admits'] = out.groupby('subject_id').cumcount()
    return out


# `train_masked` / `test_masked` were not defined anywhere earlier in this
# notebook, so this cell failed with NameError on a fresh kernel. They are now
# derived from copies of `train` / `test`, which leaves those two frames (and
# the column positions dropped in the next cell) untouched.
# NOTE(review): the *_masked frames are not used by the later cells shown in
# this notebook - confirm whether 'stay_len' / 'prev_admits' should be carried
# into the exported train/test frames.
test_masked = add_stay_and_prior_admissions(test)
train_masked = add_stay_and_prior_admissions(train)

In [171]:
# Drop the columns sitting at positions 6-9 and 11 of each frame.
# The selection is positional, so it depends on the current column order and
# the cell must not be run twice on the same frames.
train = train.drop(columns=train.columns[[6, 7, 8, 9, 11]])

test = test.drop(columns=test.columns[[6, 7, 8, 9, 11]])

In [174]:
# Empty frame whose columns are 'id', 'number of studies', then every EHR feature name.
df = pd.DataFrame(columns=['id', 'number of studies', *EHR['feature_cols']])
df

Unnamed: 0,id,number of studies,age,gender,ethnicity,Y90-Y99,G30-G32,O85-O92,C60-C63,F40-F48,...,PRE-NATAL VITAMINS,ANESTHETICS,ANTIBIOTICS,ANTIHYPERGLYCEMICS,ANTIINFLAM.TUMOR NECROSIS FACTOR INHIBITING AGENTS,SEDATIVE/HYPNOTICS,ANTIDOTES,AUTONOMIC DRUGS,VITAMINS,BIOLOGICALS


In [175]:
# Summarise the EHR data as one row per key of EHR["feat_dict"].
# Each entry is an array with one row per study and one column per feature
# (each subject and each admission could have multiple lines of data during
# the admission for different times of study); the rows are collapsed to the
# per-feature median.
#
# The rows are collected first and the DataFrame is built once at the end.
# Growing `df` with pd.concat inside the loop copies the whole frame on every
# iteration (quadratic time).
feature_cols = list(EHR['feature_cols'])
summary_ids = []
study_counts = []
median_rows = []
for key, arr in EHR["feat_dict"].items():
  summary_ids.append(key)
  study_counts.append(arr.shape[0])            # number of study rows for this id
  median_rows.append(np.median(arr, axis=0))   # one median per feature column

df = pd.concat(
    [
        pd.DataFrame({'id': summary_ids, 'number of studies': study_counts}),
        pd.DataFrame(median_rows, columns=feature_cols),
    ],
    axis=1,
)

In [176]:
# (rows, columns) of the per-id EHR summary: 'id', 'number of studies', then one column per EHR feature.
df.shape

(14532, 173)

In [179]:
# Column shared by the train/test frames and the EHR summary frame.
common_key = 'id'

# Left joins: every train/test row is kept, and the EHR summary columns are
# attached wherever the 'id' matches.
df_train = train.merge(df, how='left', on=common_key)
df_test = test.merge(df, how='left', on=common_key)


In [185]:
# Shapes of the frames before merging, one per line (train, then test).
print(train.shape, test.shape, sep='\n')

(11596, 13)
(2936, 12)


In [186]:
# Shapes after the merge, one per line (train, then test); the row counts
# should match the pre-merge frames.
print(df_train.shape, df_test.shape, sep='\n')

(11596, 185)
(2936, 184)


In [201]:
# Calculate the correlation matrix over the numeric columns only.
# numeric_only is passed explicitly: relying on the default emits a
# FutureWarning on pandas 1.5 and fails on non-numeric columns in newer versions.
correlation_matrix = df_train.corr(numeric_only=True)

# Select the correlation of features with the target variable
target_correlation = correlation_matrix['readmitted_within_30days']

# Set the threshold for high correlation
threshold = 0.1  # You can adjust this threshold as needed

# Keep features whose absolute correlation with the target exceeds the
# threshold, excluding the target itself (its self-correlation is 1).
highly_correlated_features = (
    target_correlation[target_correlation.abs() > threshold]
    .drop('readmitted_within_30days', errors='ignore')
    .index.tolist()
)

# Print the names of highly correlated features
print(highly_correlated_features)

  correlation_matrix = df_train.corr()


['deathtime', 'StudyTime', 'times of study', 'mean_admittime_diff', 'mean_dischtime_diff', 'Creatinine Blood', 'Sodium Blood', 'pO2 Blood', 'Lactate Blood', 'Anion Gap Blood', 'Chloride Blood', 'Calcium, Total Blood', 'Bicarbonate Blood', 'Glucose Blood', 'pH Blood', 'Platelet Count Blood', 'pCO2 Blood', 'ELECT/CALORIC/H2O', 'ANTIBIOTICS', 'AUTONOMIC DRUGS', 'VITAMINS']


In [None]:
# corr > 0.7: 'deathtime'
# corr > 0.3: pO2 Blood
# corr > 0.2: 'mean_admittime_diff', 'mean_dischtime_diff', 'Lactate Blood', 'pH Blood', 'pCO2 Blood'
# corr > 0.1: 'StudyTime', 'times of study', 'Creatinine Blood', 'Sodium Blood', 'Anion Gap Blood',
#         'Chloride Blood', 'Calcium, Total Blood', 'Bicarbonate Blood', 'Glucose Blood',
#          'Platelet Count Blood', 'ELECT/CALORIC/H2O', 'ANTIBIOTICS', 'AUTONOMIC DRUGS', 'VITAMINS'

In [189]:
from google.colab import files

# Write both merged frames as CSV (without the index column) into the
# runtime's working directory, then trigger a browser download of each file.
exported = (('train_valid.csv', df_train), ('test.csv', df_test))

for csv_name, frame in exported:
    frame.to_csv(csv_name, index=False)

for csv_name, frame in exported:
    files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>