# Data Preprocessing

In [None]:
# load library
import numpy as np
import pandas as pd
import zipfile
import urllib.request
import io
import os
import warnings
import sys
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import KFold
from sklearn.feature_selection import f_classif
from scipy.stats import entropy
warnings.filterwarnings('ignore')

In [None]:
# load raw data
# Download the zipped EHR pickle into memory, unpickle it, then read the three
# split tables straight from their raw GitHub URLs.
# NOTE(review): pd.read_pickle can execute code embedded in the file; this is
# only safe as long as the source repository is trusted.
url = "https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/raw_data/ehr_preprocessed_seq_by_day_cat_embedding.pkl.zip"
with urllib.request.urlopen(url) as response:
    archive_bytes = io.BytesIO(response.read())
with zipfile.ZipFile(archive_bytes) as zip_file, \
        zip_file.open("ehr_preprocessed_seq_by_day_cat_embedding.pkl", "r") as file:
    EHR = pd.read_pickle(file)

train = pd.read_csv("https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/raw_data/train.csv")
valid = pd.read_csv("https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/raw_data/valid.csv")
test = pd.read_csv("https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/raw_data/test.csv")

# Quick sanity check: table shapes (train, test, valid) and number of EHR features.
for _split in (train, test, valid):
    print(_split.shape)
print(len(EHR['feature_cols']))

(55941, 13)
(17933, 12)
(13598, 13)
171


In [None]:
# drop image-related columns
# The image-related columns are identified by position (6, 7, 8 and 11) and
# removed from each split.
image_col_positions = [6, 7, 8, 11]
train = train.drop(columns=train.columns[image_col_positions])
valid = valid.drop(columns=valid.columns[image_col_positions])
test = test.drop(columns=test.columns[image_col_positions])
train.head(5)

Unnamed: 0,id,subject_id,hadm_id,admittime,dischtime,deathtime,StudyDate,StudyTime,readmitted_within_30days
0,10869829_25238191,10869829,25238191,2141-08-20 12:41:00,2141-09-01 13:22:00,,21410828,90151.343,0
1,10869829_25238191,10869829,25238191,2141-08-20 12:41:00,2141-09-01 13:22:00,,21410831,130627.031,0
2,10869829_25238191,10869829,25238191,2141-08-20 12:41:00,2141-09-01 13:22:00,,21410821,80228.937,0
3,17910612_22301530,17910612,22301530,2188-03-04 19:49:00,2188-04-19 00:00:00,2188-04-19 00:00:00,21880405,200636.062,1
4,17910612_22301530,17910612,22301530,2188-03-04 19:49:00,2188-04-19 00:00:00,2188-04-19 00:00:00,21880321,114135.046,1


In [None]:
# extract last-day observations
def _rows_with_latest_study(frame):
    """Parse ``StudyDate`` in place and pick each id's row with the latest date.

    Returns a ``(idxs, rows)`` pair: the per-``id`` index labels of the maximum
    ``StudyDate`` and the rows of ``frame`` selected by those labels.
    """
    frame['StudyDate'] = pd.to_datetime(frame['StudyDate'], format='%Y%m%d')
    latest_idxs = frame.groupby('id')['StudyDate'].idxmax()
    return latest_idxs, frame.loc[latest_idxs]


train_latest_date_idxs, train_masked = _rows_with_latest_study(train)
valid_latest_date_idxs, valid_masked = _rows_with_latest_study(valid)
test_latest_date_idxs, test_masked = _rows_with_latest_study(test)

for _masked in (train_masked, valid_masked, test_masked):
    print(_masked.shape)

(9271, 13)
(2325, 13)
(2936, 12)


In [None]:
# merge train and valid
# Stack the full splits and their last-day subsets row-wise; the original
# index labels of each split are kept (no ignore_index).
full_splits = [train, valid]
last_day_splits = [train_masked, valid_masked]
train_valid = pd.concat(full_splits)
train_valid_masked = pd.concat(last_day_splits)
for _combined in (train_valid, train_valid_masked):
    print(_combined.shape)

(69539, 13)
(11596, 13)


In [None]:
# add two new predictors (length of stay + previous number of admissions)
def _with_stay_len_and_prev_admits(frame):
    """Add ``stay_len`` to ``frame`` in place and return a sorted copy with ``prev_admits``.

    ``stay_len`` is the whole-day part of ``dischtime - admittime``.
    ``prev_admits`` is the running count of earlier rows for the same
    ``subject_id`` once the rows are ordered by ``admittime`` (counted within
    the given table only).
    """
    # length of stay
    frame['dischtime'] = pd.to_datetime(frame['dischtime'])
    frame['admittime'] = pd.to_datetime(frame['admittime'])
    frame['stay_len'] = (frame['dischtime'] - frame['admittime']).dt.days
    # previous number of admissions
    ordered = frame.sort_values('admittime')
    ordered['prev_admits'] = ordered.groupby('subject_id').cumcount()
    return ordered


test_masked = _with_stay_len_and_prev_admits(test_masked)
valid_masked = _with_stay_len_and_prev_admits(valid_masked)
train_valid_masked = _with_stay_len_and_prev_admits(train_valid_masked)
train_masked = _with_stay_len_and_prev_admits(train_masked)

train_masked.head(5)

Unnamed: 0,id,subject_id,hadm_id,admittime,dischtime,deathtime,StudyDate,StudyTime,readmitted_within_30days,stay_len,prev_admits
41856,17195991_23542772,17195991,23542772,2110-01-11 22:47:00,2110-01-18 10:25:00,,2110-01-16,90654.546,0,6,0
24332,13721591_20342223,13721591,20342223,2110-02-09 18:13:00,2110-02-22 20:51:00,,2110-02-19,41948.468,0,13,0
20119,19170541_22178312,19170541,22178312,2110-02-28 21:48:00,2110-03-12 17:47:00,,2110-03-11,81842.812,0,11,0
51567,15554295_27705504,15554295,27705504,2110-03-09 03:54:00,2110-05-18 11:34:00,,2110-05-04,60653.312,0,70,0
29152,17643026_29919541,17643026,29919541,2110-03-25 11:15:00,2110-03-29 17:17:00,,2110-03-28,140521.453,0,4,0


In [None]:
# extract feature column names
# cat_cols: the features at EHR['cat_idxs'] followed by every ICD column.
cat_cols = [EHR['feature_cols'][i] for i in EHR['cat_idxs']]
icd_cols = EHR['icd_cols']
cat_cols.extend(icd_cols)
# num_cols: every remaining feature. Filtering feature_cols in order (instead
# of list(set(...) - set(...))) keeps the same set of names but makes the
# column order reproducible from run to run; a set difference of strings is
# ordered by hash and therefore changes with the interpreter's hash seed.
_non_numeric = set(cat_cols)
num_cols = list(dict.fromkeys(
    col for col in EHR['feature_cols'] if col not in _non_numeric
))
# NOTE: 'stay_len' and 'prev_admits' are not added to num_cols at this point.
print(cat_cols)
print(num_cols)

['gender', 'ethnicity', 'Creatinine Blood', 'Sodium Blood', 'pO2 Blood', 'Basophils Other Body Fluid', 'Basophils Pleural', 'Lactate Blood', 'Anion Gap Blood', 'Eosinophils Joint Fluid', 'Hemoglobin Blood', 'Chloride Blood', 'Eosinophils Ascites', 'pH Urine', 'Calcium, Total Blood', 'Lymphocytes Ascites', 'Eosinophils Other Body Fluid', 'Eosinophils Blood', 'Lymphocytes Blood', 'Basophils Joint Fluid', 'Hematocrit Blood', 'Potassium Blood', 'H Blood', 'Monocytes Blood', 'Eosinophils Pleural', 'Troponin T Blood', 'Neutrophils Blood', 'Bicarbonate Blood', 'Basophils Blood', 'Glucose Blood', 'Basophils Ascites', 'pH Blood', 'Platelet Count Blood', 'Lymphocytes Other Body Fluid', 'Monocytes Ascites', 'Lymphocytes Joint Fluid', 'Lymphocytes Pleural', 'pCO2 Blood', 'Y90-Y99', 'G30-G32', 'O85-O92', 'C60-C63', 'F40-F48', 'M80-M85', 'R00-R09', 'J90-J94', 'A00-A09', 'E00-E07', 'F01-F09', 'F30-F39', 'H30-H36', 'D60-D64', 'N00-N08', 'F60-F69', 'I80-I89', 'I95-I99', 'N30-N39', 'K55-K64', 'F50-F59',

In [None]:
# Merge each masked admission with its EHR observation sequence and derive
# per-admission summary features for the train_valid and test sets.
#
# Rows are assembled with pd.concat: Series.append was deprecated in pandas 1.4
# and removed in pandas 2.0, so the previous admit.append(...) calls fail on
# current pandas. pd.concat([a, b]) produces the same Series on old versions.
_SUMMARY_KEYS = (
    'all', 'latest', 'mean_mode', 'entropy_std', 'max', 'min',
    'q1', 'q3', 'kurtosis', 'skewness', 'iqr', 'range',
)


def _collect_merged_admits(admits, ehr, numeric_cols, categorical_cols):
    """Build the per-observation and per-admission feature rows for ``admits``.

    Parameters
    ----------
    admits : DataFrame with one row per admission; ``admit['id']`` is used as
        the key into ``ehr['feat_dict']``.
    ehr : mapping providing ``'feat_dict'`` (id -> sequence of observation
        vectors) and ``'feature_cols'`` (labels for one observation vector).
    numeric_cols, categorical_cols : column names summarised numerically
        (mean, std, quantiles, ...) and categorically (mode, entropy).

    Returns
    -------
    dict mapping each key in ``_SUMMARY_KEYS`` to a list of Series. ``'all'``
    holds one row per observation (admission fields + features + ``obs_id``);
    every other key holds one row per admission (admission fields + summary).
    Admissions with no observations contribute no rows.
    """
    rows = {key: [] for key in _SUMMARY_KEYS}
    feature_cols = ehr['feature_cols']

    for _, admit in admits.iterrows():
        observations = ehr['feat_dict'][admit['id']]
        num_feat_arrs = len(observations)
        if num_feat_arrs == 0:
            continue

        temp_dfs = []
        for i in range(num_feat_arrs):
            feat_arr = pd.Series(observations[i], index=feature_cols)
            merged = pd.concat([admit, feat_arr])
            merged['obs_id'] = i + 1  # 1-based position within the admission
            temp_dfs.append(merged.to_frame().transpose())
            rows['all'].append(merged)

        # latest data: feat_arr still holds the last observation of the loop
        rows['latest'].append(pd.concat([admit, feat_arr]))

        # One row per observation; numeric columns are cast so the statistics
        # below operate on floats rather than on object dtype.
        temp_df = pd.concat(temp_dfs, axis=0)
        temp_df[numeric_cols] = temp_df[numeric_cols].astype(float)
        numeric = temp_df[numeric_cols]
        categorical = temp_df[categorical_cols]

        col_max = numeric.max()
        col_min = numeric.min()
        q1 = numeric.quantile(0.25)
        q3 = numeric.quantile(0.75)

        summaries = {
            # mean of numeric features, first mode of categorical features
            'mean_mode': pd.concat([numeric.mean(), categorical.mode().iloc[0]]),
            # std of numeric features, entropy of categorical value counts
            'entropy_std': pd.concat([
                numeric.std(),
                categorical.apply(lambda x: entropy(x.value_counts()), axis=0),
            ]),
            'max': col_max,
            'min': col_min,
            'q1': q1,
            'q3': q3,
            'kurtosis': numeric.kurtosis(),
            'skewness': numeric.skew(),
            'iqr': q3 - q1,
            'range': col_max - col_min,
        }
        for key, summary in summaries.items():
            rows[key].append(pd.concat([admit, summary]))

    return rows


# train_valid set
_train_valid_rows = _collect_merged_admits(train_valid_masked, EHR, num_cols, cat_cols)
train_valid_merged_admits_all = _train_valid_rows['all']
train_valid_merged_admits_latest = _train_valid_rows['latest']
train_valid_merged_admits_mean_mode = _train_valid_rows['mean_mode']
train_valid_merged_admits_entropy_std = _train_valid_rows['entropy_std']
train_valid_merged_admits_max = _train_valid_rows['max']
train_valid_merged_admits_min = _train_valid_rows['min']
train_valid_merged_admits_q1 = _train_valid_rows['q1']
train_valid_merged_admits_q3 = _train_valid_rows['q3']
train_valid_merged_admits_kurtosis = _train_valid_rows['kurtosis']
train_valid_merged_admits_skewness = _train_valid_rows['skewness']
train_valid_merged_admits_iqr = _train_valid_rows['iqr']
train_valid_merged_admits_range = _train_valid_rows['range']

train_valid_merged_df_all = pd.DataFrame(train_valid_merged_admits_all)
train_valid_merged_df_latest = pd.DataFrame(train_valid_merged_admits_latest)
train_valid_merged_df_mean_mode = pd.DataFrame(train_valid_merged_admits_mean_mode)
train_valid_merged_df_entropy_std = pd.DataFrame(train_valid_merged_admits_entropy_std)
train_valid_merged_df_max = pd.DataFrame(train_valid_merged_admits_max)
train_valid_merged_df_min = pd.DataFrame(train_valid_merged_admits_min)
train_valid_merged_df_q1 = pd.DataFrame(train_valid_merged_admits_q1)
train_valid_merged_df_q3 = pd.DataFrame(train_valid_merged_admits_q3)
train_valid_merged_df_kurtosis = pd.DataFrame(train_valid_merged_admits_kurtosis)
train_valid_merged_df_skewness = pd.DataFrame(train_valid_merged_admits_skewness)
train_valid_merged_df_iqr = pd.DataFrame(train_valid_merged_admits_iqr)
train_valid_merged_df_range = pd.DataFrame(train_valid_merged_admits_range)

# test set
_test_rows = _collect_merged_admits(test_masked, EHR, num_cols, cat_cols)
test_merged_admits_all = _test_rows['all']
test_merged_admits_latest = _test_rows['latest']
test_merged_admits_mean_mode = _test_rows['mean_mode']
test_merged_admits_entropy_std = _test_rows['entropy_std']
test_merged_admits_max = _test_rows['max']
test_merged_admits_min = _test_rows['min']
test_merged_admits_q1 = _test_rows['q1']
test_merged_admits_q3 = _test_rows['q3']
test_merged_admits_kurtosis = _test_rows['kurtosis']
test_merged_admits_skewness = _test_rows['skewness']
test_merged_admits_iqr = _test_rows['iqr']
test_merged_admits_range = _test_rows['range']

test_merged_df_all = pd.DataFrame(test_merged_admits_all)
test_merged_df_latest = pd.DataFrame(test_merged_admits_latest)
test_merged_df_mean_mode = pd.DataFrame(test_merged_admits_mean_mode)
test_merged_df_entropy_std = pd.DataFrame(test_merged_admits_entropy_std)
test_merged_df_max = pd.DataFrame(test_merged_admits_max)
test_merged_df_min = pd.DataFrame(test_merged_admits_min)
test_merged_df_q1 = pd.DataFrame(test_merged_admits_q1)
test_merged_df_q3 = pd.DataFrame(test_merged_admits_q3)
test_merged_df_kurtosis = pd.DataFrame(test_merged_admits_kurtosis)
test_merged_df_skewness = pd.DataFrame(test_merged_admits_skewness)
test_merged_df_iqr = pd.DataFrame(test_merged_admits_iqr)
test_merged_df_range = pd.DataFrame(test_merged_admits_range)


In [None]:
# standardization for numeric features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Numeric features are those that are neither categorical nor ICD columns,
# plus the two engineered predictors. Filtering feature_cols in order keeps the
# same set of names as a set difference but with a reproducible order.
cat_cols = [EHR['feature_cols'][i] for i in EHR['cat_idxs']]
icd_cols = EHR['icd_cols']
_non_numeric = set(cat_cols) | set(icd_cols)
num_cols = list(dict.fromkeys(
    col for col in EHR['feature_cols'] if col not in _non_numeric
))
num_cols.extend(['stay_len', 'prev_admits'])

# For each feature table, learn the mean/std on the training table only and
# apply those same statistics to the matching test table. Re-fitting the
# scaler on the test table would put train and test on different scales.
_scaling_pairs = (
    (train_valid_merged_df_all, test_merged_df_all),
    (train_valid_merged_df_mean_mode, test_merged_df_mean_mode),
    (train_valid_merged_df_latest, test_merged_df_latest),
    (train_valid_merged_df_entropy_std, test_merged_df_entropy_std),
    (train_valid_merged_df_min, test_merged_df_min),
    (train_valid_merged_df_max, test_merged_df_max),
    (train_valid_merged_df_q1, test_merged_df_q1),
    (train_valid_merged_df_q3, test_merged_df_q3),
    (train_valid_merged_df_kurtosis, test_merged_df_kurtosis),
    (train_valid_merged_df_skewness, test_merged_df_skewness),
    (train_valid_merged_df_iqr, test_merged_df_iqr),
    (train_valid_merged_df_range, test_merged_df_range),
)
for _train_df, _test_df in _scaling_pairs:
    # Column assignment modifies the frames in place, so no rebinding is needed.
    _train_df[num_cols] = scaler.fit_transform(_train_df[num_cols])
    _test_df[num_cols] = scaler.transform(_test_df[num_cols])

In [None]:
# drop features with constant values in the training set
def _feature_and_constant_cols(frame, first_feature_pos):
    """Return ``(feature_cols, constant_cols)`` for ``frame``.

    ``feature_cols`` are the column names from position ``first_feature_pos``
    onwards; ``constant_cols`` are those among them with at most one distinct
    non-null value.
    """
    feature_cols = frame.columns[first_feature_pos:].to_list()
    constant_cols = [col for col in feature_cols if frame[col].nunique() <= 1]
    return feature_cols, constant_cols


# Move obs_id to position 2 in both per-observation tables.
for _seq_df in (train_valid_merged_df_all, test_merged_df_all):
    _seq_df.insert(2, 'obs_id', _seq_df.pop('obs_id'))

# Constant columns are always detected on the training tables.
# NOTE(review): the start positions (14 for the per-observation table, 13 for
# the others) are hard-coded; confirm they match the number of leading
# non-feature columns actually present in these tables.
feat_cols_all, const_cols_all = _feature_and_constant_cols(train_valid_merged_df_all, 14)
feat_cols_latest, const_cols_latest = _feature_and_constant_cols(train_valid_merged_df_latest, 13)
feat_cols_mean_mode, const_cols_mean_mode = _feature_and_constant_cols(train_valid_merged_df_mean_mode, 13)
feat_cols_entropy_std, const_cols_entropy_std = _feature_and_constant_cols(train_valid_merged_df_entropy_std, 13)
feat_cols_min, const_cols_min = _feature_and_constant_cols(train_valid_merged_df_min, 13)
feat_cols_max, const_cols_max = _feature_and_constant_cols(train_valid_merged_df_max, 13)
feat_cols_q1, const_cols_q1 = _feature_and_constant_cols(train_valid_merged_df_q1, 13)
feat_cols_q3, const_cols_q3 = _feature_and_constant_cols(train_valid_merged_df_q3, 13)
feat_cols_kurtosis, const_cols_kurtosis = _feature_and_constant_cols(train_valid_merged_df_kurtosis, 13)
feat_cols_skewness, const_cols_skewness = _feature_and_constant_cols(train_valid_merged_df_skewness, 13)
feat_cols_iqr, const_cols_iqr = _feature_and_constant_cols(train_valid_merged_df_iqr, 13)
feat_cols_range, const_cols_range = _feature_and_constant_cols(train_valid_merged_df_range, 13)

# Remove the same columns from each training table and its test counterpart.
train_valid_merged_df_all = train_valid_merged_df_all.drop(columns=const_cols_all)
test_merged_df_all = test_merged_df_all.drop(columns=const_cols_all)

train_valid_merged_df_latest = train_valid_merged_df_latest.drop(columns=const_cols_latest)
test_merged_df_latest = test_merged_df_latest.drop(columns=const_cols_latest)

train_valid_merged_df_mean_mode = train_valid_merged_df_mean_mode.drop(columns=const_cols_mean_mode)
test_merged_df_mean_mode = test_merged_df_mean_mode.drop(columns=const_cols_mean_mode)

train_valid_merged_df_entropy_std = train_valid_merged_df_entropy_std.drop(columns=const_cols_entropy_std)
test_merged_df_entropy_std = test_merged_df_entropy_std.drop(columns=const_cols_entropy_std)

train_valid_merged_df_min = train_valid_merged_df_min.drop(columns=const_cols_min)
test_merged_df_min = test_merged_df_min.drop(columns=const_cols_min)

train_valid_merged_df_max = train_valid_merged_df_max.drop(columns=const_cols_max)
test_merged_df_max = test_merged_df_max.drop(columns=const_cols_max)

train_valid_merged_df_q1 = train_valid_merged_df_q1.drop(columns=const_cols_q1)
test_merged_df_q1 = test_merged_df_q1.drop(columns=const_cols_q1)

train_valid_merged_df_q3 = train_valid_merged_df_q3.drop(columns=const_cols_q3)
test_merged_df_q3 = test_merged_df_q3.drop(columns=const_cols_q3)

train_valid_merged_df_kurtosis = train_valid_merged_df_kurtosis.drop(columns=const_cols_kurtosis)
test_merged_df_kurtosis = test_merged_df_kurtosis.drop(columns=const_cols_kurtosis)

train_valid_merged_df_skewness = train_valid_merged_df_skewness.drop(columns=const_cols_skewness)
test_merged_df_skewness = test_merged_df_skewness.drop(columns=const_cols_skewness)

train_valid_merged_df_iqr = train_valid_merged_df_iqr.drop(columns=const_cols_iqr)
test_merged_df_iqr = test_merged_df_iqr.drop(columns=const_cols_iqr)

train_valid_merged_df_range = train_valid_merged_df_range.drop(columns=const_cols_range)
test_merged_df_range = test_merged_df_range.drop(columns=const_cols_range)

# Report the resulting shape of every training table.
for _train_table in (
    train_valid_merged_df_all,
    train_valid_merged_df_latest,
    train_valid_merged_df_mean_mode,
    train_valid_merged_df_entropy_std,
    train_valid_merged_df_min,
    train_valid_merged_df_max,
    train_valid_merged_df_q1,
    train_valid_merged_df_q3,
    train_valid_merged_df_kurtosis,
    train_valid_merged_df_skewness,
    train_valid_merged_df_iqr,
    train_valid_merged_df_range,
):
    print(_train_table.shape)

(173154, 183)
(11596, 175)
(11596, 169)
(11596, 77)
(11596, 55)
(11596, 55)
(11596, 55)
(11596, 55)
(11596, 38)
(11596, 38)
(11596, 23)
(11596, 38)


In [None]:
# Print the shape of every test-set feature table.
for _test_table in (
    test_merged_df_all,
    test_merged_df_latest,
    test_merged_df_mean_mode,
    test_merged_df_entropy_std,
    test_merged_df_min,
    test_merged_df_max,
    test_merged_df_q1,
    test_merged_df_q3,
    test_merged_df_kurtosis,
    test_merged_df_skewness,
    test_merged_df_iqr,
    test_merged_df_range,
):
    print(_test_table.shape)

(46076, 182)
(2936, 174)
(2936, 168)
(2936, 76)
(2936, 54)
(2936, 54)
(2936, 54)
(2936, 54)
(2936, 37)
(2936, 37)
(2936, 22)
(2936, 37)


In [None]:
# add suffix
def _suffix_feature_names(frame, n_leading, suffix):
    """Rename ``frame``'s columns in place.

    The first ``n_leading`` column names are kept as they are; ``suffix`` is
    appended to the string form of every remaining column name.
    """
    names = list(frame.columns)
    frame.columns = names[:n_leading] + [str(name) + suffix for name in names[n_leading:]]


# Training tables keep their first 13 names, test tables their first 12.
# NOTE(review): these offsets are hard-coded (and are the same for the
# per-observation table, which carries the extra obs_id column); confirm they
# match the number of leading non-feature columns.
for _suffix, _train_df, _test_df in (
    ('_entropy_std', train_valid_merged_df_entropy_std, test_merged_df_entropy_std),
    ('_mean_mode', train_valid_merged_df_mean_mode, test_merged_df_mean_mode),
    ('_latest', train_valid_merged_df_latest, test_merged_df_latest),
    ('_all', train_valid_merged_df_all, test_merged_df_all),
    ('_min', train_valid_merged_df_min, test_merged_df_min),
    ('_max', train_valid_merged_df_max, test_merged_df_max),
    ('_q1', train_valid_merged_df_q1, test_merged_df_q1),
    ('_q3', train_valid_merged_df_q3, test_merged_df_q3),
    ('_kurtosis', train_valid_merged_df_kurtosis, test_merged_df_kurtosis),
    ('_skewness', train_valid_merged_df_skewness, test_merged_df_skewness),
    ('_iqr', train_valid_merged_df_iqr, test_merged_df_iqr),
    ('_range', train_valid_merged_df_range, test_merged_df_range),
):
    _suffix_feature_names(_train_df, 13, _suffix)
    _suffix_feature_names(_test_df, 12, _suffix)

In [None]:
# write csv files
# Create the output directories first: to_csv and ZipFile do not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs('train', exist_ok=True)
os.makedirs('test', exist_ok=True)

# train set
for _frame, _path in (
    (train_valid_merged_df_latest, 'train/train_valid_latest.csv'),
    (train_valid_merged_df_mean_mode, 'train/train_valid_mean_mode.csv'),
    (train_valid_merged_df_all, 'train/train_valid_all.csv'),
    (train_valid_merged_df_entropy_std, 'train/train_valid_entropy_std.csv'),
    (train_valid_merged_df_min, 'train/train_valid_min.csv'),
    (train_valid_merged_df_max, 'train/train_valid_max.csv'),
    (train_valid_merged_df_q1, 'train/train_valid_q1.csv'),
    (train_valid_merged_df_q3, 'train/train_valid_q3.csv'),
    (train_valid_merged_df_kurtosis, 'train/train_valid_kurtosis.csv'),
    (train_valid_merged_df_skewness, 'train/train_valid_skewness.csv'),
    (train_valid_merged_df_iqr, 'train/train_valid_iqr.csv'),
    (train_valid_merged_df_range, 'train/train_valid_range.csv'),
):
    _frame.to_csv(_path, index=False)
# The per-observation table is kept only as a compressed archive; the plain
# CSV is deleted once it has been added to the zip.
with zipfile.ZipFile('train/sequence_data.zip', 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    zipf.write('train/train_valid_all.csv')
os.remove('train/train_valid_all.csv')

# test set
for _frame, _path in (
    (test_merged_df_latest, 'test/test_latest.csv'),
    (test_merged_df_mean_mode, 'test/test_mean_mode.csv'),
    (test_merged_df_entropy_std, 'test/test_entropy_std.csv'),
    (test_merged_df_all, 'test/test_all.csv'),
    (test_merged_df_min, 'test/test_min.csv'),
    (test_merged_df_max, 'test/test_max.csv'),
    (test_merged_df_q1, 'test/test_q1.csv'),
    (test_merged_df_q3, 'test/test_q3.csv'),
    (test_merged_df_kurtosis, 'test/test_kurtosis.csv'),
    (test_merged_df_skewness, 'test/test_skewness.csv'),
    (test_merged_df_iqr, 'test/test_iqr.csv'),
    (test_merged_df_range, 'test/test_range.csv'),
):
    _frame.to_csv(_path, index=False)
with zipfile.ZipFile('test/sequence_data.zip', 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    zipf.write('test/test_all.csv')
os.remove('test/test_all.csv')