# Data Preprocessing

In [1]:
# load library
import numpy as np
import pandas as pd
import zipfile
import urllib.request
import io
import os
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GroupKFold

In [2]:
# load raw data
url = "https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/raw_data/ehr_preprocessed_seq_by_day_cat_embedding.pkl.zip"
with urllib.request.urlopen(url) as response:
    with zipfile.ZipFile(io.BytesIO(response.read())) as zip_file:
        with zip_file.open("ehr_preprocessed_seq_by_day_cat_embedding.pkl", "r") as file:
            EHR = pd.read_pickle(file)
train = pd.read_csv("https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/raw_data/train.csv")
valid = pd.read_csv("https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/raw_data/valid.csv")
test = pd.read_csv("https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/raw_data/test.csv")
print(train.shape)
print(test.shape)
print(valid.shape)
print(len(EHR['feature_cols']))

(55941, 13)
(17933, 12)
(13598, 13)
171


In [3]:
# drop image-related columns
train = train.drop(train.columns[[6,7,8,11]], axis=1)
valid = valid.drop(valid.columns[[6,7,8,11]], axis=1)
test = test.drop(test.columns[[6,7,8,11]], axis=1)
train.head(5)

Unnamed: 0,id,subject_id,hadm_id,admittime,dischtime,deathtime,StudyDate,StudyTime,readmitted_within_30days
0,10869829_25238191,10869829,25238191,2141-08-20 12:41:00,2141-09-01 13:22:00,,21410828,90151.343,0
1,10869829_25238191,10869829,25238191,2141-08-20 12:41:00,2141-09-01 13:22:00,,21410831,130627.031,0
2,10869829_25238191,10869829,25238191,2141-08-20 12:41:00,2141-09-01 13:22:00,,21410821,80228.937,0
3,17910612_22301530,17910612,22301530,2188-03-04 19:49:00,2188-04-19 00:00:00,2188-04-19 00:00:00,21880405,200636.062,1
4,17910612_22301530,17910612,22301530,2188-03-04 19:49:00,2188-04-19 00:00:00,2188-04-19 00:00:00,21880321,114135.046,1


In [4]:
# extract last-day observations
train['StudyDate'] = pd.to_datetime(train['StudyDate'], format='%Y%m%d')
train_latest_date_idxs = train.groupby('id')['StudyDate'].idxmax()
train_masked = train.loc[train_latest_date_idxs]


valid['StudyDate'] = pd.to_datetime(valid['StudyDate'], format='%Y%m%d')
valid_latest_date_idxs = valid.groupby('id')['StudyDate'].idxmax()
valid_masked = valid.loc[valid_latest_date_idxs]

test['StudyDate'] = pd.to_datetime(test['StudyDate'], format='%Y%m%d')
test_latest_date_idxs = test.groupby('id')['StudyDate'].idxmax()
test_masked = test.loc[test_latest_date_idxs]

print(train_masked.shape)
print(valid_masked.shape)
print(test_masked.shape)

(9271, 9)
(2325, 9)
(2936, 8)


In [5]:
# merge train and valid
train = pd.concat([train, valid])
train_masked = pd.concat([train_masked, valid_masked])
print(train.shape)
print(train_masked.shape)

(69539, 9)
(11596, 9)


In [6]:
# add two new predictors (length of stay + previous number of admissions)
# length of stay
train_masked['dischtime'] = pd.to_datetime(train_masked['dischtime'])
train_masked['admittime'] = pd.to_datetime(train_masked['admittime'])
train_masked['LoS'] = (train_masked['dischtime'] - train_masked['admittime']).dt.days
test_masked['dischtime'] = pd.to_datetime(test_masked['dischtime'])
test_masked['admittime'] = pd.to_datetime(test_masked['admittime'])
test_masked['LoS'] = (test_masked['dischtime'] - test_masked['admittime']).dt.days

# previous number of admissions
train_masked = train_masked.sort_values('admittime')
train_masked['prev_admits'] = train_masked.groupby('subject_id').cumcount()
test_masked = test_masked.sort_values('admittime')
test_masked['prev_admits'] = test_masked.groupby('subject_id').cumcount()

train_masked.head(5)

Unnamed: 0,id,subject_id,hadm_id,admittime,dischtime,deathtime,StudyDate,StudyTime,readmitted_within_30days,LoS,prev_admits
41856,17195991_23542772,17195991,23542772,2110-01-11 22:47:00,2110-01-18 10:25:00,,2110-01-16,90654.546,0,6,0
24332,13721591_20342223,13721591,20342223,2110-02-09 18:13:00,2110-02-22 20:51:00,,2110-02-19,41948.468,0,13,0
8330,10582595_20690213,10582595,20690213,2110-02-09 20:41:00,2110-02-17 15:40:00,,2110-02-16,50133.578,0,7,0
3862,14385035_20480421,14385035,20480421,2110-02-23 14:28:00,2110-03-11 17:29:00,,2110-02-28,214324.437,0,16,0
20119,19170541_22178312,19170541,22178312,2110-02-28 21:48:00,2110-03-12 17:47:00,,2110-03-11,81842.812,0,11,0


In [7]:
# construct merged dataframes (admit info + features)
train_merged_admits_all = []
train_merged_admits_latest = []
train_merged_admits_median = []
for _, admit in train_masked.iterrows():
    # sequence data
    num_feat_arrs = len(EHR['feat_dict'][admit['id']])
    for i in range(num_feat_arrs):
        feat_arr = pd.Series(EHR['feat_dict'][admit['id']][i], index=EHR['feature_cols'])
        merged = pd.concat([admit, feat_arr])
        merged['obs_id'] = i + 1  
        train_merged_admits_all.append(merged)
    # selected data
    feat_arr_latest = pd.Series(EHR['feat_dict'][admit['id']][-1], index=EHR['feature_cols'])
    feat_arr_median = round(pd.Series(np.nanmedian(EHR['feat_dict'][admit['id']], axis=0), index=EHR['feature_cols']))
    train_merged_admits_latest.append(pd.concat([admit, feat_arr_latest]))
    train_merged_admits_median.append(pd.concat([admit, feat_arr_median]))
train_merged_df_all = pd.DataFrame(train_merged_admits_all)
train_merged_df_latest = pd.DataFrame(train_merged_admits_latest)
train_merged_df_median = pd.DataFrame(train_merged_admits_median)

# test set
test_merged_admits_all = []
test_merged_admits_latest = []
test_merged_admits_median = []
for _, admit in test_masked.iterrows():
    # sequence data
    num_feat_arrs = len(EHR['feat_dict'][admit['id']])
    for i in range(num_feat_arrs):
        feat_arr = pd.Series(EHR['feat_dict'][admit['id']][i], index=EHR['feature_cols'])
        merged = pd.concat([admit, feat_arr])
        merged['obs_id'] = i + 1  
        test_merged_admits_all.append(merged)
    # selected data
    feat_arr_latest = pd.Series(EHR['feat_dict'][admit['id']][-1], index=EHR['feature_cols'])
    feat_arr_median = round(pd.Series(np.nanmedian(EHR['feat_dict'][admit['id']], axis=0), index=EHR['feature_cols']))
    test_merged_admits_latest.append(pd.concat([admit, feat_arr_latest]))
    test_merged_admits_median.append(pd.concat([admit, feat_arr_median]))
test_merged_df_all = pd.DataFrame(test_merged_admits_all)
test_merged_df_latest = pd.DataFrame(test_merged_admits_latest)
test_merged_df_median = pd.DataFrame(test_merged_admits_median)

In [8]:
# drop info columns
train_merged_df_all = train_merged_df_all.drop(train_merged_df_all.columns[2:8], axis=1)
train_merged_df_all.insert(2, 'obs_id', train_merged_df_all.pop('obs_id'))
test_merged_df_all = test_merged_df_all.drop(test_merged_df_all.columns[2:8], axis=1)
test_merged_df_all.insert(2, 'obs_id', test_merged_df_all.pop('obs_id'))
train_merged_df_latest = train_merged_df_latest.drop(train_merged_df_latest.columns[2:8], axis=1)
test_merged_df_latest = test_merged_df_latest.drop(test_merged_df_latest.columns[2:8], axis=1)
train_merged_df_median = train_merged_df_median.drop(train_merged_df_median.columns[2:8], axis=1)
test_merged_df_median = test_merged_df_median.drop(test_merged_df_median.columns[2:8], axis=1)

In [9]:
# standardization for numeric features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

cat_cols = [EHR['feature_cols'][i] for i in EHR['cat_idxs']]
icd_cols = EHR['icd_cols']
num_cols = list(set(EHR['feature_cols']) - set(cat_cols) - set(icd_cols)) 
num_cols.extend(['LoS', 'prev_admits']) 

train_merged_df_all[num_cols] = scaler.fit_transform(train_merged_df_all[num_cols])
train_merged_df_median[num_cols] = scaler.fit_transform(train_merged_df_median[num_cols])
train_merged_df_latest[num_cols] = scaler.fit_transform(train_merged_df_latest[num_cols])
test_merged_df_all[num_cols] = scaler.fit_transform(test_merged_df_all[num_cols])
test_merged_df_median[num_cols] = scaler.fit_transform(test_merged_df_median[num_cols])
test_merged_df_latest[num_cols] = scaler.fit_transform(test_merged_df_latest[num_cols])

train_merged_df_latest.head(5)

Unnamed: 0,id,subject_id,readmitted_within_30days,LoS,prev_admits,age,gender,ethnicity,Y90-Y99,G30-G32,...,PRE-NATAL VITAMINS,ANESTHETICS,ANTIBIOTICS,ANTIHYPERGLYCEMICS,ANTIINFLAM.TUMOR NECROSIS FACTOR INHIBITING AGENTS,SEDATIVE/HYPNOTICS,ANTIDOTES,AUTONOMIC DRUGS,VITAMINS,BIOLOGICALS
0,17195991_23542772,17195991,0,-0.426682,-0.406394,-0.322013,0,6,0,0,...,-0.015166,-0.46459,-0.455345,-0.324132,-0.009287,-0.428107,-0.047867,2.207636,-0.380706,-0.304657
1,13721591_20342223,13721591,0,-0.025023,-0.406394,-0.075877,0,6,0,0,...,-0.015166,0.215206,-0.131328,-0.23759,-0.009287,-0.428107,-0.047867,-0.305346,-0.380706,-0.304657
2,10582595_20690213,10582595,0,-0.369302,-0.406394,1.524003,0,6,0,0,...,-0.015166,3.61419,0.192689,-0.324132,-0.009287,-0.065138,-0.047867,-0.305346,-0.380706,-0.304657
3,14385035_20480421,14385035,0,0.147116,-0.406394,0.84713,1,6,0,0,...,-0.015166,-0.46459,0.840724,0.022039,-0.009287,-0.428107,-0.047867,-0.305346,-0.380706,1.551782
4,19170541_22178312,19170541,0,-0.139783,-0.406394,-1.368088,1,3,0,0,...,-0.015166,0.215206,-0.293337,-0.324132,-0.009287,-0.428107,-0.047867,-0.305346,-0.380706,-0.304657


In [10]:
# drop features with constant values in the training set
feat_cols_all = train_merged_df_all.columns[4:].to_list()
feat_cols_latest = train_merged_df_latest.columns[3:].to_list()
feat_cols_median = train_merged_df_median.columns[3:].to_list()
const_cols_all = [col for col in feat_cols_all if train_merged_df_all[col].nunique() <= 1]
const_cols_latest = [col for col in feat_cols_latest if train_merged_df_latest[col].nunique() <= 1]
const_cols_median = [col for col in feat_cols_median if train_merged_df_median[col].nunique() <= 1]
print(const_cols_all)
print(const_cols_latest)
print(const_cols_median)

train_merged_df_all = train_merged_df_all.drop(columns=const_cols_all, axis=1)
test_merged_df_all = test_merged_df_all.drop(columns=const_cols_all, axis=1)
train_merged_df_latest = train_merged_df_latest.drop(columns=const_cols_latest, axis=1)
train_merged_df_median = train_merged_df_median.drop(columns=const_cols_median, axis=1)
test_merged_df_latest = test_merged_df_latest.drop(columns=const_cols_latest, axis=1)
test_merged_df_median = test_merged_df_median.drop(columns=const_cols_median, axis=1)

print(train_merged_df_all.shape)
print(train_merged_df_latest.shape)
print(train_merged_df_median.shape)

train_merged_df_median.head()

['A20-A28', 'J00-J06']
['A20-A28', 'J00-J06', 'Basophils Other Body Fluid', 'Basophils Pleural', 'Eosinophils Joint Fluid', 'pH Urine', 'Basophils Joint Fluid', 'Basophils Ascites', 'Lymphocytes Joint Fluid']
['A20-A28', 'J00-J06', 'Basophils Other Body Fluid', 'Basophils Pleural', 'Eosinophils Joint Fluid', 'Eosinophils Ascites', 'pH Urine', 'Lymphocytes Ascites', 'Eosinophils Other Body Fluid', 'Basophils Joint Fluid', 'H Blood', 'Basophils Ascites', 'Lymphocytes Other Body Fluid', 'Monocytes Ascites', 'Lymphocytes Joint Fluid']
(173154, 175)
(11596, 167)
(11596, 161)


Unnamed: 0,id,subject_id,readmitted_within_30days,LoS,prev_admits,age,gender,ethnicity,Y90-Y99,G30-G32,...,PRE-NATAL VITAMINS,ANESTHETICS,ANTIBIOTICS,ANTIHYPERGLYCEMICS,ANTIINFLAM.TUMOR NECROSIS FACTOR INHIBITING AGENTS,SEDATIVE/HYPNOTICS,ANTIDOTES,AUTONOMIC DRUGS,VITAMINS,BIOLOGICALS
0,17195991_23542772,17195991,0,-0.426682,-0.406394,-0.322013,0.0,6.0,0.0,0.0,...,-0.015166,-0.46459,-0.455359,-0.324178,-0.009287,-0.428196,-0.047867,2.207636,-0.380706,-0.304657
1,13721591_20342223,13721591,0,-0.025023,-0.406394,-0.075877,0.0,6.0,0.0,0.0,...,-0.015166,0.215206,-0.131342,-0.237635,-0.009287,-0.428196,-0.047867,-0.305346,-0.380706,-0.304657
2,10582595_20690213,10582595,0,-0.369302,-0.406394,1.524003,0.0,6.0,0.0,0.0,...,-0.015166,3.61419,0.192675,-0.324178,-0.009287,-0.065231,-0.047867,-0.305346,-0.380706,-0.304657
3,14385035_20480421,14385035,0,0.147116,-0.406394,0.84713,1.0,6.0,0.0,0.0,...,-0.015166,-0.46459,0.840709,0.021994,-0.009287,-0.428196,-0.047867,-0.305346,-0.380706,1.551782
4,19170541_22178312,19170541,0,-0.139783,-0.406394,-1.368088,1.0,3.0,0.0,0.0,...,-0.015166,0.215206,-0.29335,-0.324178,-0.009287,-0.428196,-0.047867,-0.305346,-0.380706,-0.304657


In [11]:
train_merged_df_median.head()

Unnamed: 0,id,subject_id,readmitted_within_30days,LoS,prev_admits,age,gender,ethnicity,Y90-Y99,G30-G32,...,PRE-NATAL VITAMINS,ANESTHETICS,ANTIBIOTICS,ANTIHYPERGLYCEMICS,ANTIINFLAM.TUMOR NECROSIS FACTOR INHIBITING AGENTS,SEDATIVE/HYPNOTICS,ANTIDOTES,AUTONOMIC DRUGS,VITAMINS,BIOLOGICALS
0,17195991_23542772,17195991,0,-0.426682,-0.406394,-0.322013,0.0,6.0,0.0,0.0,...,-0.015166,-0.46459,-0.455359,-0.324178,-0.009287,-0.428196,-0.047867,2.207636,-0.380706,-0.304657
1,13721591_20342223,13721591,0,-0.025023,-0.406394,-0.075877,0.0,6.0,0.0,0.0,...,-0.015166,0.215206,-0.131342,-0.237635,-0.009287,-0.428196,-0.047867,-0.305346,-0.380706,-0.304657
2,10582595_20690213,10582595,0,-0.369302,-0.406394,1.524003,0.0,6.0,0.0,0.0,...,-0.015166,3.61419,0.192675,-0.324178,-0.009287,-0.065231,-0.047867,-0.305346,-0.380706,-0.304657
3,14385035_20480421,14385035,0,0.147116,-0.406394,0.84713,1.0,6.0,0.0,0.0,...,-0.015166,-0.46459,0.840709,0.021994,-0.009287,-0.428196,-0.047867,-0.305346,-0.380706,1.551782
4,19170541_22178312,19170541,0,-0.139783,-0.406394,-1.368088,1.0,3.0,0.0,0.0,...,-0.015166,0.215206,-0.29335,-0.324178,-0.009287,-0.428196,-0.047867,-0.305346,-0.380706,-0.304657


In [12]:
train_merged_df_latest.head()

Unnamed: 0,id,subject_id,readmitted_within_30days,LoS,prev_admits,age,gender,ethnicity,Y90-Y99,G30-G32,...,PRE-NATAL VITAMINS,ANESTHETICS,ANTIBIOTICS,ANTIHYPERGLYCEMICS,ANTIINFLAM.TUMOR NECROSIS FACTOR INHIBITING AGENTS,SEDATIVE/HYPNOTICS,ANTIDOTES,AUTONOMIC DRUGS,VITAMINS,BIOLOGICALS
0,17195991_23542772,17195991,0,-0.426682,-0.406394,-0.322013,0,6,0,0,...,-0.015166,-0.46459,-0.455345,-0.324132,-0.009287,-0.428107,-0.047867,2.207636,-0.380706,-0.304657
1,13721591_20342223,13721591,0,-0.025023,-0.406394,-0.075877,0,6,0,0,...,-0.015166,0.215206,-0.131328,-0.23759,-0.009287,-0.428107,-0.047867,-0.305346,-0.380706,-0.304657
2,10582595_20690213,10582595,0,-0.369302,-0.406394,1.524003,0,6,0,0,...,-0.015166,3.61419,0.192689,-0.324132,-0.009287,-0.065138,-0.047867,-0.305346,-0.380706,-0.304657
3,14385035_20480421,14385035,0,0.147116,-0.406394,0.84713,1,6,0,0,...,-0.015166,-0.46459,0.840724,0.022039,-0.009287,-0.428107,-0.047867,-0.305346,-0.380706,1.551782
4,19170541_22178312,19170541,0,-0.139783,-0.406394,-1.368088,1,3,0,0,...,-0.015166,0.215206,-0.293337,-0.324132,-0.009287,-0.428107,-0.047867,-0.305346,-0.380706,-0.304657


In [13]:
train_merged_df_median.head()

Unnamed: 0,id,subject_id,readmitted_within_30days,LoS,prev_admits,age,gender,ethnicity,Y90-Y99,G30-G32,...,PRE-NATAL VITAMINS,ANESTHETICS,ANTIBIOTICS,ANTIHYPERGLYCEMICS,ANTIINFLAM.TUMOR NECROSIS FACTOR INHIBITING AGENTS,SEDATIVE/HYPNOTICS,ANTIDOTES,AUTONOMIC DRUGS,VITAMINS,BIOLOGICALS
0,17195991_23542772,17195991,0,-0.426682,-0.406394,-0.322013,0.0,6.0,0.0,0.0,...,-0.015166,-0.46459,-0.455359,-0.324178,-0.009287,-0.428196,-0.047867,2.207636,-0.380706,-0.304657
1,13721591_20342223,13721591,0,-0.025023,-0.406394,-0.075877,0.0,6.0,0.0,0.0,...,-0.015166,0.215206,-0.131342,-0.237635,-0.009287,-0.428196,-0.047867,-0.305346,-0.380706,-0.304657
2,10582595_20690213,10582595,0,-0.369302,-0.406394,1.524003,0.0,6.0,0.0,0.0,...,-0.015166,3.61419,0.192675,-0.324178,-0.009287,-0.065231,-0.047867,-0.305346,-0.380706,-0.304657
3,14385035_20480421,14385035,0,0.147116,-0.406394,0.84713,1.0,6.0,0.0,0.0,...,-0.015166,-0.46459,0.840709,0.021994,-0.009287,-0.428196,-0.047867,-0.305346,-0.380706,1.551782
4,19170541_22178312,19170541,0,-0.139783,-0.406394,-1.368088,1.0,3.0,0.0,0.0,...,-0.015166,0.215206,-0.29335,-0.324178,-0.009287,-0.428196,-0.047867,-0.305346,-0.380706,-0.304657


In [14]:
# Feature selection
# 1. drop features with correlation (with response) lower than 0.1 (10 group fold correlation test)
# 2. if multiple features have correlation > 0.7 between each other, only keep the one with the highest correlation with response

#train_merged_df_all
#STEP1
k = 10
cv = GroupKFold(n_splits=k)
selected_features = []



for train_index, _ in cv.split(train_merged_df_all, groups=train_merged_df_all['subject_id']):
    fold_data = train_merged_df_all.iloc[train_index]
    numeric_data = fold_data.select_dtypes(include=np.number) 
    correlations = numeric_data.corrwith(fold_data['readmitted_within_30days'])
    selected_features.append(correlations)
    
#Compute average correlation
selected_features_df = pd.concat(selected_features, axis=1)
average_correlation = selected_features_df.mean(axis=1)

#Drop variables with correlation with response lower than 0.1
selected_columns = average_correlation[average_correlation < 0.01].index
train_merged_df_all = train_merged_df_all.drop(selected_columns, axis=1)

#STEP2
corr_matrix = train_merged_df_all.corr()
cols = corr_matrix.columns

# Drop variables with correlation with correlation between each other>0.7 except the highest one
for i in range(len(cols)):
    for j in range(i+1, len(cols)):
        if abs(corr_matrix.iloc[i,j]) > 0.7:
            if abs(corr_matrix.iloc[i,-1]) > abs(corr_matrix.iloc[j,-1]):
                train_merged_df_all = train_merged_df_all.drop(cols[j], axis=1)
            else:
                train_merged_df_all = train_merged_df_all.drop(cols[i], axis=1)
                break


print(train_merged_df_all.shape)

(173154, 51)


In [15]:
#train_merged_df_latest
#STEP1
k = 10
cv = GroupKFold(n_splits=k)
selected_features = []
for train_index, _ in cv.split(train_merged_df_latest, groups=train_merged_df_latest['subject_id']):
    fold_data = train_merged_df_latest.iloc[train_index]
    numeric_data = fold_data.select_dtypes(include=np.number) 
    correlations = numeric_data.corrwith(fold_data['readmitted_within_30days'])
    selected_features.append(correlations)
selected_features_df = pd.concat(selected_features, axis=1)
average_correlation = selected_features_df.mean(axis=1)
selected_columns = average_correlation[average_correlation < 0.01].index
train_merged_df_latest = train_merged_df_latest.drop(selected_columns, axis=1)

#STEP2
corr_matrix = train_merged_df_latest.corr()
cols = corr_matrix.columns
for i in range(len(cols)):
    for j in range(i+1, len(cols)):
        if abs(corr_matrix.iloc[i,j]) > 0.7:
            if abs(corr_matrix.iloc[i,-1]) > abs(corr_matrix.iloc[j,-1]):
                train_merged_df_latest = train_merged_df_latest.drop(cols[j], axis=1)
            else:
                train_merged_df_latest = train_merged_df_latest.drop(cols[i], axis=1)
                break
print(train_merged_df_latest.shape)

(11596, 55)


In [16]:
#train_merged_df_median
#STEP1
k = 10
cv = GroupKFold(n_splits=k)
selected_features = []
for train_index, _ in cv.split(train_merged_df_median, groups=train_merged_df_median['subject_id']):
    fold_data = train_merged_df_median.iloc[train_index]
    numeric_data = fold_data.select_dtypes(include=np.number) 
    correlations = numeric_data.corrwith(fold_data['readmitted_within_30days'])
    selected_features.append(correlations)
selected_features_df = pd.concat(selected_features, axis=1)
average_correlation = selected_features_df.mean(axis=1)
selected_columns = average_correlation[average_correlation < 0.01].index
train_merged_df_median = train_merged_df_median.drop(selected_columns, axis=1)

#STEP2
corr_matrix = train_merged_df_median.corr()
cols = corr_matrix.columns
for i in range(len(cols)):
    for j in range(i+1, len(cols)):
        if abs(corr_matrix.iloc[i,j]) > 0.7:
            if abs(corr_matrix.iloc[i,-1]) > abs(corr_matrix.iloc[j,-1]):
                train_merged_df_median = train_merged_df_median.drop(cols[j], axis=1)
            else:
                train_merged_df_median = train_merged_df_median.drop(cols[i], axis=1)
                break
print(train_merged_df_median.shape)

(11596, 52)


In [17]:
# address class imbalance issue (training set)
# oversampling the minority class
# note: pip install -U threadpoolctl to debug if necessary
# resample latest_df
X_latest = train_merged_df_latest.drop(columns=['readmitted_within_30days'])
y_latest = train_merged_df_latest['readmitted_within_30days']
print('Original dataset shape %s' % Counter(y_latest))
smo_latest = SMOTE(random_state=42)
X_latest_smo, y_latest_smo = smo_latest.fit_resample(X_latest, y_latest)
print('Resampled dataset shape %s' % Counter(y_latest_smo))
y_latest_smo = pd.DataFrame(y_latest_smo)
y_latest_smo.columns = ['readmitted_within_30days']
smo_df_latest = pd.concat([X_latest_smo, y_latest_smo], axis=1)
smo_df_latest.to_csv('train_smote_latest_feat.csv', index=False)

# resample median_df
X_median = train_merged_df_median.drop(columns=['readmitted_within_30days'])
y_median = train_merged_df_median['readmitted_within_30days']
print('Original dataset shape %s' % Counter(y_median))
smo_median = SMOTE(random_state=42)
X_median_smo, y_median_smo = smo_median.fit_resample(X_median, y_median)
print('Resampled dataset shape %s' % Counter(y_median_smo))
y_median_smo = pd.DataFrame(y_median_smo)
y_median_smo.columns = ['readmitted_within_30days']
smo_df_median = pd.concat([X_median_smo, y_median_smo], axis=1)
smo_df_median.to_csv('train_smote_median_feat.csv', index=False)

Original dataset shape Counter({0: 9564, 1: 2032})
Resampled dataset shape Counter({0: 9564, 1: 9564})
Original dataset shape Counter({0: 9564, 1: 2032})
Resampled dataset shape Counter({0: 9564, 1: 9564})


In [18]:
# write csv files
train_merged_df_latest.to_csv('train_merged_latest_feat.csv', index=False)
train_merged_df_median.to_csv('train_merged_median_feat.csv', index=False)
test_merged_df_latest.to_csv('test_merged_latest_feat.csv', index=False)
test_merged_df_median.to_csv('test_merged_median_feat.csv', index=False)

train_merged_df_all.to_csv('train_merged_all_feat.csv', index=False)
test_merged_df_all.to_csv('test_merged_all_feat.csv', index=False)
with zipfile.ZipFile('sequence_data.zip', 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    zipf.write('train_merged_all_feat.csv')
    zipf.write('test_merged_all_feat.csv')
os.remove('train_merged_all_feat.csv')
os.remove('test_merged_all_feat.csv')