# Data Preprocessing

In [101]:
# load library
import numpy as np
import pandas as pd
import zipfile
import urllib.request
import io
import os
import warnings
import sys
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import KFold
from sklearn.feature_selection import f_classif
warnings.filterwarnings('ignore')

In [102]:
# load raw data
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling, so
# this cell is only safe for as long as the repository below is trusted.
url = "https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/raw_data/ehr_preprocessed_seq_by_day_cat_embedding.pkl.zip"

# Buffer the whole archive in memory, then unpickle the single member we need.
with urllib.request.urlopen(url) as response:
    archive_bytes = io.BytesIO(response.read())
with zipfile.ZipFile(archive_bytes) as zip_file:
    with zip_file.open("ehr_preprocessed_seq_by_day_cat_embedding.pkl", "r") as file:
        EHR = pd.read_pickle(file)

# Admission tables for the three splits.
train = pd.read_csv("https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/raw_data/train.csv")
valid = pd.read_csv("https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/raw_data/valid.csv")
test = pd.read_csv("https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/raw_data/test.csv")

# Report the split sizes (train, test, valid) and the number of EHR feature columns.
for split_frame in (train, test, valid):
    print(split_frame.shape)
print(len(EHR['feature_cols']))

(55941, 13)
(17933, 12)
(13598, 13)
171


In [103]:
# drop image-related columns
# The same four column positions are removed from every split. The positions
# refer to the column layout of the frames as they are when this cell runs,
# so the cell is meant to run once, right after loading.
image_col_positions = [6, 7, 8, 11]
train = train.drop(columns=train.columns[image_col_positions])
valid = valid.drop(columns=valid.columns[image_col_positions])
test = test.drop(columns=test.columns[image_col_positions])
train.head(5)

Unnamed: 0,id,subject_id,hadm_id,admittime,dischtime,deathtime,StudyDate,StudyTime,readmitted_within_30days
0,10869829_25238191,10869829,25238191,2141-08-20 12:41:00,2141-09-01 13:22:00,,21410828,90151.343,0
1,10869829_25238191,10869829,25238191,2141-08-20 12:41:00,2141-09-01 13:22:00,,21410831,130627.031,0
2,10869829_25238191,10869829,25238191,2141-08-20 12:41:00,2141-09-01 13:22:00,,21410821,80228.937,0
3,17910612_22301530,17910612,22301530,2188-03-04 19:49:00,2188-04-19 00:00:00,2188-04-19 00:00:00,21880405,200636.062,1
4,17910612_22301530,17910612,22301530,2188-03-04 19:49:00,2188-04-19 00:00:00,2188-04-19 00:00:00,21880321,114135.046,1


In [104]:
# extract last-day observations
# Parse StudyDate (written as YYYYMMDD) into datetimes, in place on each split.
for split_frame in (train, valid, test):
    split_frame['StudyDate'] = pd.to_datetime(split_frame['StudyDate'], format='%Y%m%d')

# For every id, find the row label that carries the most recent StudyDate.
train_latest_date_idxs = train.groupby('id')['StudyDate'].idxmax()
valid_latest_date_idxs = valid.groupby('id')['StudyDate'].idxmax()
test_latest_date_idxs = test.groupby('id')['StudyDate'].idxmax()

# Keep only those rows: one row per id in each "masked" frame.
train_masked = train.loc[train_latest_date_idxs]
valid_masked = valid.loc[valid_latest_date_idxs]
test_masked = test.loc[test_latest_date_idxs]

for masked_frame in (train_masked, valid_masked, test_masked):
    print(masked_frame.shape)

(9271, 9)
(2325, 9)
(2936, 8)


In [105]:
# merge train and valid
# Stack the validation rows underneath the training rows, once for the full
# tables and once for the one-row-per-id tables. Row labels are carried over
# unchanged (no ignore_index), so labels may repeat across the two splits.
full_frames = [train, valid]
masked_frames = [train_masked, valid_masked]
train_valid = pd.concat(full_frames)
train_valid_masked = pd.concat(masked_frames)
for stacked_frame in (train_valid, train_valid_masked):
    print(stacked_frame.shape)

(69539, 9)
(11596, 9)


In [106]:
# add two new predictors (length of stay + previous number of admissions)
# length of stay: whole days between admission and discharge, added in place
for masked_frame in (test_masked, train_valid_masked):
    masked_frame['dischtime'] = pd.to_datetime(masked_frame['dischtime'])
    masked_frame['admittime'] = pd.to_datetime(masked_frame['admittime'])
    stay = masked_frame['dischtime'] - masked_frame['admittime']
    masked_frame['stay_len'] = stay.dt.days

# previous number of admissions: after sorting by admission time, number each
# subject's rows 0, 1, 2, ... so a row's value is how many earlier rows the
# same subject has within the same frame
test_masked = (
    test_masked
    .sort_values('admittime')
    .assign(prev_admits=lambda frame: frame.groupby('subject_id').cumcount())
)
train_valid_masked = (
    train_valid_masked
    .sort_values('admittime')
    .assign(prev_admits=lambda frame: frame.groupby('subject_id').cumcount())
)

# NOTE(review): this previews train_masked, which this cell does not modify,
# so the two new columns are not visible in the displayed rows.
train_masked.head(5)

Unnamed: 0,id,subject_id,hadm_id,admittime,dischtime,deathtime,StudyDate,StudyTime,readmitted_within_30days
31255,10001884_26184834,10001884,26184834,2131-01-07 20:39:00,2131-01-20 05:15:00,2131-01-20 05:15:00,2131-01-15,44509.078,1
41168,10003019_20962108,10003019,20962108,2176-01-06 15:52:00,2176-01-14 18:09:00,,2176-01-13,92032.812,0
40192,10003400_20214994,10003400,20214994,2137-02-24 10:00:00,2137-03-19 15:45:00,,2137-03-09,15654.89,0
37758,10003400_23559586,10003400,23559586,2137-08-04 00:07:00,2137-09-02 17:05:00,2137-09-02 17:05:00,2137-08-27,34728.062,1
2047,10004235_24181354,10004235,24181354,2196-02-24 14:38:00,2196-03-04 14:02:00,,2196-02-29,53356.765,0


In [107]:
# construct merged dataframes (admit info + features)
def _merge_admits_with_features(admits_df, ehr):
    """Pair every admission row with the feature arrays stored for its id.

    Parameters
    ----------
    admits_df : pandas.DataFrame
        Admission rows; each row's ``id`` value must be a key of
        ``ehr['feat_dict']``.
    ehr : mapping
        Must provide ``feature_cols`` (labels for one feature array) and
        ``feat_dict`` (id -> sequence of feature arrays).

    Returns
    -------
    tuple of three lists of pandas.Series
        ``(rows_all, rows_latest, rows_median)``:
        * ``rows_all``    - one entry per (admission, feature array), with an
          extra ``obs_id`` holding the 1-based position of the array;
        * ``rows_latest`` - one entry per admission, using its last array;
        * ``rows_median`` - one entry per admission, using the rounded
          NaN-ignoring median over all of its arrays.

    Raises
    ------
    KeyError
        If an admission id is missing from ``ehr['feat_dict']``.
    """
    feature_cols = ehr['feature_cols']
    rows_all, rows_latest, rows_median = [], [], []
    for _, admit in admits_df.iterrows():
        # fetch the admission's feature arrays once instead of on every use
        feat_arrs = ehr['feat_dict'][admit['id']]
        # sequence data
        for i in range(len(feat_arrs)):
            merged = pd.concat([admit, pd.Series(feat_arrs[i], index=feature_cols)])
            merged['obs_id'] = i + 1
            rows_all.append(merged)
        # selected data
        feat_arr_latest = pd.Series(feat_arrs[-1], index=feature_cols)
        feat_arr_median = round(pd.Series(np.nanmedian(feat_arrs, axis=0), index=feature_cols))
        rows_latest.append(pd.concat([admit, feat_arr_latest]))
        rows_median.append(pd.concat([admit, feat_arr_median]))
    return rows_all, rows_latest, rows_median


# train_valid set
(train_valid_merged_admits_all,
 train_valid_merged_admits_latest,
 train_valid_merged_admits_median) = _merge_admits_with_features(train_valid_masked, EHR)
train_valid_merged_df_all = pd.DataFrame(train_valid_merged_admits_all)
train_valid_merged_df_latest = pd.DataFrame(train_valid_merged_admits_latest)
train_valid_merged_df_median = pd.DataFrame(train_valid_merged_admits_median)

# test set
(test_merged_admits_all,
 test_merged_admits_latest,
 test_merged_admits_median) = _merge_admits_with_features(test_masked, EHR)
test_merged_df_all = pd.DataFrame(test_merged_admits_all)
test_merged_df_latest = pd.DataFrame(test_merged_admits_latest)
test_merged_df_median = pd.DataFrame(test_merged_admits_median)

In [108]:
# standardization for numeric features
from sklearn.preprocessing import StandardScaler

# Numeric columns are the EHR features that are neither categorical nor ICD
# columns, plus the two engineered predictors. The list is built in
# feature_cols order so it is the same on every run.
cat_cols = [EHR['feature_cols'][i] for i in EHR['cat_idxs']]
icd_cols = EHR['icd_cols']
non_numeric_cols = set(cat_cols) | set(icd_cols)
num_cols = [col for col in dict.fromkeys(EHR['feature_cols']) if col not in non_numeric_cols]
num_cols.extend(['stay_len', 'prev_admits'])

# Fit the scaler on the training frame only and reuse its mean/std for the
# matching test frame. Re-fitting on the test frame would scale test features
# with different statistics than the ones the model is trained on.
for train_frame, test_frame in (
    (train_valid_merged_df_all, test_merged_df_all),
    (train_valid_merged_df_median, test_merged_df_median),
    (train_valid_merged_df_latest, test_merged_df_latest),
):
    scaler = StandardScaler()
    train_frame[num_cols] = scaler.fit_transform(train_frame[num_cols])
    test_frame[num_cols] = scaler.transform(test_frame[num_cols])

In [109]:
# drop features with constant values in the training set
def _constant_columns(frame, candidates):
    """Return the candidate columns of `frame` holding at most one distinct non-null value."""
    return [name for name in candidates if frame[name].nunique() <= 1]


# Move obs_id to the third column of both sequence frames.
for seq_frame in (train_valid_merged_df_all, test_merged_df_all):
    seq_frame.insert(2, 'obs_id', seq_frame.pop('obs_id'))

# Columns from these positions onward are the ones checked for constant values
# (the sequence frames start one position later because of obs_id).
feat_cols_all = train_valid_merged_df_all.columns[10:].tolist()
feat_cols_latest = train_valid_merged_df_latest.columns[9:].tolist()
feat_cols_median = train_valid_merged_df_median.columns[9:].tolist()

# Constant columns are detected on the training frames only.
const_cols_all = _constant_columns(train_valid_merged_df_all, feat_cols_all)
const_cols_latest = _constant_columns(train_valid_merged_df_latest, feat_cols_latest)
const_cols_median = _constant_columns(train_valid_merged_df_median, feat_cols_median)
for const_cols in (const_cols_all, const_cols_latest, const_cols_median):
    print(const_cols)

# The same columns are then removed from the matching test frame.
train_valid_merged_df_all = train_valid_merged_df_all.drop(columns=const_cols_all)
test_merged_df_all = test_merged_df_all.drop(columns=const_cols_all)
train_valid_merged_df_latest = train_valid_merged_df_latest.drop(columns=const_cols_latest)
test_merged_df_latest = test_merged_df_latest.drop(columns=const_cols_latest)
train_valid_merged_df_median = train_valid_merged_df_median.drop(columns=const_cols_median)
test_merged_df_median = test_merged_df_median.drop(columns=const_cols_median)

for reduced_frame in (train_valid_merged_df_all,
                      train_valid_merged_df_latest,
                      train_valid_merged_df_median):
    print(reduced_frame.shape)

['A20-A28', 'J00-J06']
['A20-A28', 'J00-J06', 'Basophils Other Body Fluid', 'Basophils Pleural', 'Eosinophils Joint Fluid', 'pH Urine', 'Basophils Joint Fluid', 'Basophils Ascites', 'Lymphocytes Joint Fluid']
['A20-A28', 'J00-J06', 'Basophils Other Body Fluid', 'Basophils Pleural', 'Eosinophils Joint Fluid', 'Eosinophils Ascites', 'pH Urine', 'Lymphocytes Ascites', 'Eosinophils Other Body Fluid', 'Basophils Joint Fluid', 'H Blood', 'Basophils Ascites', 'Lymphocytes Other Body Fluid', 'Monocytes Ascites', 'Lymphocytes Joint Fluid']
(173154, 181)
(11596, 173)
(11596, 167)


In [110]:
# make copies
#train_valid_merged_df_all_copy = train_valid_merged_df_all.copy()
#train_valid_merged_df_median_copy = train_valid_merged_df_median.copy()
#train_valid_merged_df_latest_copy = train_valid_merged_df_latest.copy()

In [111]:
# drop info columns
#train_valid_merged_df_all = train_valid_merged_df_all.drop(train_valid_merged_df_all.columns[0:9], axis=1)
#train_valid_merged_df_latest = train_valid_merged_df_latest.drop(train_valid_merged_df_latest.columns[0:8], axis=1)
#train_valid_merged_df_median = train_valid_merged_df_median.drop(train_valid_merged_df_median.columns[0:8], axis=1)

In [112]:
# feature selection with anova F-test
#datasets = [('train_valid_merged_df_all', train_valid_merged_df_all), 
#            ('train_valid_merged_df_median', train_valid_merged_df_median), 
#            ('train_valid_merged_df_latest', train_valid_merged_df_latest)]
#
#for dataset_name, dataset in datasets:
#    X = dataset.drop('readmitted_within_30days', axis=1)
#    y = dataset['readmitted_within_30days']
#    f_score_sums = dict.fromkeys(X.columns, 0.0)
#    kf = KFold(n_splits=10, shuffle=True, random_state=1)
#    for train_index, test_index in kf.split(X):
#        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
#        f_values, p_values = f_classif(X_train, y_train)
#        for i, feature in enumerate(X.columns):
#            f_score_sums[feature] += f_values[i]
#    f_score_avgs = {feature: f_score_sum / 10 for feature, f_score_sum in f_score_sums.items()}
#    top_features = sorted(f_score_avgs, key=f_score_avgs.get, reverse=True)[:100]
#    setattr(sys.modules[__name__], dataset_name.replace('merged', 'selected'), dataset[['readmitted_within_30days'] + top_features])
        

In [113]:
# feature selection with correlation test
# 1. drop features with correlation (with response) lower than 0.1 (5/10 fold correlation test)
# 2. if multiple features have correlation > 0.7 between each other, only keep the one with the highest correlation with response

#train_merged_df_all
#STEP1
# feature selection
# 1. drop features with correlation (with response) lower than 0.1 (5/10 fold correlation test)
# 2. if multiple features have correlation > 0.7 between each other, only keep the one with the highest correlation with response

#train_merged_df_all
#STEP1

#k = 10
#cv = KFold(n_splits=k, shuffle=True)
#selected_features = []
#for train_index, _ in cv.split(train_merged_df_all):
#    fold_data = train_merged_df_all.iloc[train_index]
#    numeric_data = fold_data.select_dtypes(include=np.number) 
#    correlations = numeric_data.corrwith(fold_data['readmitted_within_30days'])
#    selected_features.append(correlations)
#compute average correlation
#selected_features_df = pd.concat(selected_features, axis=1)
#average_correlation = selected_features_df.mean(axis=1)
#drop variables whose average correlation with the response is below the 0.001 cut-off applied on the next line
#selected_columns = average_correlation[average_correlation < 0.001].index
#train_selected_df_all = train_merged_df_all.drop(selected_columns, axis=1)
#STEP2
#corr_matrix = train_selected_df_all.corr()
#cols = corr_matrix.columns
#among variables whose pairwise correlation exceeds 0.7, drop all but the one most strongly correlated with the response
#for i in range(len(cols)):
#    for j in range(i+1, len(cols)):
#        if abs(corr_matrix.iloc[i,j]) > 0.7:
#            if abs(corr_matrix.iloc[i,-1]) > abs(corr_matrix.iloc[j,-1]):
#                train_selected_df_all = train_selected_df_all.drop(cols[j], axis=1)
#            else:
#                train_selected_df_all = train_selected_df_all.drop(cols[i], axis=1)
#                break
#print(train_selected_df_all.shape)

#train_merged_df_latest
#STEP1
#k = 10
#cv = KFold(n_splits=k, shuffle=True)
#selected_features = []
#for train_index, _ in cv.split(train_merged_df_latest):
#    fold_data = train_merged_df_latest.iloc[train_index]
#    numeric_data = fold_data.select_dtypes(include=np.number) 
#    correlations = numeric_data.corrwith(fold_data['readmitted_within_30days'])
#    selected_features.append(correlations)
#selected_features_df = pd.concat(selected_features, axis=1)
#average_correlation = selected_features_df.mean(axis=1)
#selected_columns = average_correlation[average_correlation < 0.001].index
#train_selected_df_latest = train_merged_df_latest.drop(selected_columns, axis=1)
#STEP2
#corr_matrix = train_selected_df_latest.corr()
#cols = corr_matrix.columns
#for i in range(len(cols)):
#    for j in range(i+1, len(cols)):
#        if abs(corr_matrix.iloc[i,j]) > 0.7:
#            if abs(corr_matrix.iloc[i,-1]) > abs(corr_matrix.iloc[j,-1]):
#                train_selected_df_latest = train_selected_df_latest.drop(cols[j], axis=1)
#            else:
#                train_selected_df_latest = train_selected_df_latest.drop(cols[i], axis=1)
#                break
#print(train_selected_df_latest.shape)

#train_merged_df_median
#STEP1
#k = 10
#cv = KFold(n_splits=k, shuffle=True)
#selected_features = []
#for train_index, _ in cv.split(train_merged_df_median):
#    fold_data = train_merged_df_median.iloc[train_index]
#    numeric_data = fold_data.select_dtypes(include=np.number) 
#    correlations = numeric_data.corrwith(fold_data['readmitted_within_30days'])
#    selected_features.append(correlations)
#selected_features_df = pd.concat(selected_features, axis=1)
#average_correlation = selected_features_df.mean(axis=1)
#selected_columns = average_correlation[average_correlation < 0.001].index
#train_selected_df_median = train_merged_df_median.drop(selected_columns, axis=1)
#STEP2
#corr_matrix = train_selected_df_median.corr()
#cols = corr_matrix.columns
#for i in range(len(cols)):
#    for j in range(i+1, len(cols)):
#        if abs(corr_matrix.iloc[i,j]) > 0.7:
#            if abs(corr_matrix.iloc[i,-1]) > abs(corr_matrix.iloc[j,-1]):
#                train_selected_df_median = train_selected_df_median.drop(cols[j], axis=1)
#            else:
#                train_selected_df_median = train_selected_df_median.drop(cols[i], axis=1)
#                break
#print(train_selected_df_median.shape)

In [114]:
#train_valid_merged_df_all
#STEP1
#k = 10
#cv = KFold(n_splits=k, shuffle=True)
#selected_features = []
#for train_valid_index, _ in cv.split(train_valid_merged_df_all):
#    fold_data = train_valid_merged_df_all.iloc[train_valid_index]
#    numeric_data = fold_data.select_dtypes(include=np.number) 
#    correlations = numeric_data.corrwith(fold_data['readmitted_within_30days'])
#    selected_features.append(correlations)
#selected_features_df = pd.concat(selected_features, axis=1)
#average_correlation = selected_features_df.mean(axis=1)
#selected_columns = average_correlation[average_correlation < 0.001].index
#train_valid_selected_df_all = train_valid_merged_df_all.drop(selected_columns, axis=1)
#STEP2
#corr_matrix = train_valid_selected_df_all.corr()
#cols = corr_matrix.columns
#for i in range(len(cols)):
#    for j in range(i+1, len(cols)):
#        if abs(corr_matrix.iloc[i,j]) > 0.7:
#            if abs(corr_matrix.iloc[i,-1]) > abs(corr_matrix.iloc[j,-1]):
#                train_valid_selected_df_all = train_valid_selected_df_all.drop(cols[j], axis=1)
#            else:
#                train_valid_selected_df_all = train_valid_selected_df_all.drop(cols[i], axis=1)
#                break
#print(train_valid_selected_df_all.shape)

#train_valid_merged_df_latest
#STEP1
#k = 10
#cv = KFold(n_splits=k)
#selected_features = []
#for train_valid_index, _ in cv.split(train_valid_merged_df_latest):
#    fold_data = train_valid_merged_df_latest.iloc[train_valid_index]
#    numeric_data = fold_data.select_dtypes(include=np.number) 
#    correlations = numeric_data.corrwith(fold_data['readmitted_within_30days'])
#    selected_features.append(correlations)
#selected_features_df = pd.concat(selected_features, axis=1)
#average_correlation = selected_features_df.mean(axis=1)
#selected_columns = average_correlation[average_correlation < 0.001].index
#train_valid_selected_df_latest = train_valid_merged_df_latest.drop(selected_columns, axis=1)
#STEP2
#corr_matrix = train_valid_selected_df_latest.corr()
#cols = corr_matrix.columns
#for i in range(len(cols)):
#    for j in range(i+1, len(cols)):
#        if abs(corr_matrix.iloc[i,j]) > 0.7:
#            if abs(corr_matrix.iloc[i,-1]) > abs(corr_matrix.iloc[j,-1]):
#                train_valid_selected_df_latest = train_valid_selected_df_latest.drop(cols[j], axis=1)
#            else:
#                train_valid_selected_df_latest = train_valid_selected_df_latest.drop(cols[i], axis=1)
#                break
#print(train_valid_selected_df_latest.shape)

#train_valid_merged_df_median
#STEP1
#k = 10
#cv = KFold(n_splits=k)
#selected_features = []
#for train_valid_index, _ in cv.split(train_valid_merged_df_median):
#    fold_data = train_valid_merged_df_median.iloc[train_valid_index]
#    numeric_data = fold_data.select_dtypes(include=np.number) 
#    correlations = numeric_data.corrwith(fold_data['readmitted_within_30days'])
#    selected_features.append(correlations)
#selected_features_df = pd.concat(selected_features, axis=1)
#average_correlation = selected_features_df.mean(axis=1)
#selected_columns = average_correlation[average_correlation < 0.001].index
#train_valid_selected_df_median = train_valid_merged_df_median.drop(selected_columns, axis=1)
#STEP2
#corr_matrix = train_valid_selected_df_median.corr()
#cols = corr_matrix.columns
#for i in range(len(cols)):
#    for j in range(i+1, len(cols)):
#        if abs(corr_matrix.iloc[i,j]) > 0.7:
#            if abs(corr_matrix.iloc[i,-1]) > abs(corr_matrix.iloc[j,-1]):
#                train_valid_selected_df_median = train_valid_selected_df_median.drop(cols[j], axis=1)
#            else:
#                train_valid_selected_df_median = train_valid_selected_df_median.drop(cols[i], axis=1)
#                break
#print(train_valid_selected_df_median.shape)

In [115]:
# write csv files
# to_csv and ZipFile do not create missing parent folders, so make sure both
# output directories exist before anything is written.
os.makedirs('train', exist_ok=True)
os.makedirs('test', exist_ok=True)

# train set
train_valid_merged_df_latest.to_csv('train/train_valid_latest.csv', index=False)
train_valid_merged_df_median.to_csv('train/train_valid_median.csv', index=False)
# The sequence table is written as csv, packed into a deflate-compressed zip,
# and the uncompressed csv is then removed.
train_valid_merged_df_all.to_csv('train/train_valid_all.csv', index=False)
with zipfile.ZipFile('train/sequence_data.zip', 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    zipf.write('train/train_valid_all.csv')
os.remove('train/train_valid_all.csv')

# test set (all three tables are kept as plain csv)
test_merged_df_latest.to_csv('test/test_latest.csv', index=False)
test_merged_df_median.to_csv('test/test_median.csv', index=False)
test_merged_df_all.to_csv('test/test_all.csv', index=False)