In [2]:
from imports import *
np.set_printoptions(threshold=np.inf)

In [3]:
# reading new ecg dataset
ecg = pd.read_csv('./datasets/ECG_extracted_filtered_anonym-new.csv')

In [4]:
ecg['Waveform_Type'].value_counts()

Waveform_Type
Rhythm    2687
Median    2684
Name: count, dtype: int64

In [5]:
# choosing Waveform_Type = 'Rhythm' only
ecg_rhythm = ecg[ecg['Waveform_Type'] == 'Rhythm']

# choosing Time2Transplant between -180 and 180 days
ecg_rhythm_tdate = ecg_rhythm[(ecg_rhythm['Time2Transplant'] >= -180) 
                              & (ecg_rhythm['Time2Transplant'] <= 180)]

# removing unnecessary columns: Waveform_Type and Time2Transplant
df_ecg = ecg_rhythm_tdate.drop(['Waveform_Type', 'Time2Transplant'], axis=1) 

# checking if patient MRN is unique and no duplicates are there
is_unique = df_ecg['MRN_formatted_DEID'].nunique() == df_ecg['MRN_formatted_DEID'].count()
print(is_unique)

# rename MRN column
df_ecg.rename(columns={'MRN_formatted_DEID': 'MRN'}, inplace=True)

True


In [6]:
# List all columns except 'MRN'
ecg_columns = [col for col in df_ecg.columns if col != 'MRN']  

In [7]:
# convert signal string to array
def convert_to_array(str_signal):
    # Remove square brackets
    str_signal = str_signal.strip('[]')

    # Replace the ellipsis with a space (if it exists)
    str_signal = str_signal.replace('...', ' ')

    # Split the string into individual numbers
    str_signal = str_signal.split()

    # Convert to integers and then to a NumPy array
    return np.array([float(num) for num in str_signal])

for col in ecg_columns:
    df_ecg[col] = df_ecg[col].apply(convert_to_array) 

In [8]:
def get_length(signal):
    if signal is not None and not pd.isna(signal).all():
        return len(signal)
    else:
        return 0  

# get lengths of ecg signals
df_lengths = df_ecg[ecg_columns].applymap(get_length)

# Find any instances where length is not 6
not_six = df_lengths[df_lengths != 6].dropna(how='all', axis=1)

# Check if there are any
if not not_six.empty:
    print("There are signals with length different from 6.")
    print(not_six)
else:
    print("All signals have a length of 6.")

All signals have a length of 6.


In [9]:
# so no need for padding
# normalization of ecg signals

def normalize_signal(signal):
    min_val = signal.min()
    max_val = signal.max()
    # Avoid division by zero in case all values in the signal are the same
    if max_val - min_val != 0:
        normalized_signal = (signal - min_val) / (max_val - min_val)
        # Cast to float32
        normalized_signal = normalized_signal.astype('float32')
        return normalized_signal
    else:
        return signal
    
for col in ecg_columns:
    df_ecg[col] = df_ecg[col].apply(normalize_signal)


In [10]:
df_ecg.head()



Unnamed: 0,I,II,III,AvR,AvL,AvF,V1,V2,V3,V4,V5,V6,MRN
0,"[1.0, 0.7788945, 0.55778897, 0.0, 0.0, 0.0]","[0.1875, 0.104166664, 0.0, 0.9166667, 1.0, 1.0]","[0.0, 0.1632653, 0.3265306, 0.97959185, 1.0, 1.0]","[0.0, 0.2804878, 0.58536583, 1.0, 0.9634146, 0...","[1.0, 0.8082192, 0.6210046, 0.01369863, 0.0, 0.0]","[0.0, 0.123188406, 0.24637681, 0.9637681, 1.0,...","[0.0, 0.5, 1.0, 0.97619045, 1.0, 1.0]","[0.0, 0.5, 1.0, 0.75, 1.0, 1.0]","[0.05263158, 0.02631579, 0.0, 1.0, 0.94736844,...","[0.0, 0.16666667, 0.33333334, 0.93333334, 1.0,...","[0.0, 0.0, 0.0, 1.0, 0.8, 0.8]","[0.0, 0.03448276, 0.06896552, 1.0, 0.9655172, ...",24264922
2,"[0.87096775, 0.9354839, 1.0, 0.0, 0.0, 0.0]","[1.0, 0.9766355, 0.95327103, 0.0, 0.0, 0.0]","[1.0, 0.93939394, 0.8787879, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 1.0, 1.0, 1.0]","[0.0, 0.13207547, 0.28301886, 1.0, 1.0, 1.0]","[1.0, 0.96256685, 0.9197861, 0.0, 0.0, 0.0]","[0.2777778, 0.11111111, 0.0, 1.0, 1.0, 1.0]","[0.11764706, 0.0, 0.0, 1.0, 0.9411765, 0.9411765]","[-19.52, -19.52, -19.52, -19.52, -19.52, -19.52]","[1.0, 1.0, 0.95238096, 0.0, 0.0, 0.0]","[0.5, 1.0, 1.0, 0.0, 0.25, 0.25]","[1.0, 1.0, 1.0, 0.06666667, 0.0, 0.0]",25719094
4,"[0.55172414, 0.77011496, 1.0, 0.057471264, 0.0...","[0.0, 0.1724138, 0.3448276, 1.0, 0.82758623, 0...","[0.16, 0.08, 0.0, 1.0, 0.96, 0.96]","[1.0, 0.5, 0.0, 0.73333335, 1.0, 1.0]","[0.7128713, 0.8613861, 1.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 1.0, 0.9113924, 0.9113924]","[0.0, 0.0, 0.0, 0.9, 1.0, 1.0]","[0.0, 0.0, 0.0, 1.0, 1.0, 1.0]","[0.0, 0.0, 0.0, 0.6666667, 1.0, 1.0]","[0.9298246, 0.9649123, 1.0, 0.0, 0.01754386, 0...","[0.4, 0.2, 0.0, 1.0, 1.0, 1.0]","[0.03448276, 0.03448276, 0.0, 1.0, 1.0, 1.0]",97580870
6,"[0.0, 0.0, 0.0, 1.0, 0.7916667, 0.7916667]","[0.32876712, 0.6712329, 1.0, 0.0, 0.0, 0.06849...","[0.5, 0.75, 1.0, 0.0, 0.05, 0.1]","[0.9230769, 0.46153846, 0.0, 0.9230769, 1.0, 0...","[0.39344263, 0.19672132, 0.0, 1.0, 0.91803277,...","[0.43023255, 0.7093023, 1.0, 0.0, 0.034883723,...","[0.875, 0.9375, 1.0, 0.0, 0.0, 0.0]","[0.071428575, 0.035714287, 0.0, 1.0, 0.9642857...","[0.15384616, 0.07692308, 0.0, 1.0, 1.0, 1.0]","[1.0, 0.95, 0.9, 0.0, 0.05, 0.05]","[0.22222222, 0.11111111, 0.0, 1.0, 0.8888889, ...","[0.0, 0.09090909, 0.18181819, 1.0, 1.0, 1.0]",77281657
8,"[1.0, 1.0, 1.0, 0.05, 0.0, 0.0]","[0.08064516, 0.04032258, 0.0, 1.0, 1.0, 1.0]","[0.04347826, 0.02173913, 0.0, 0.9782609, 1.0, ...","[0.6666667, 0.8333333, 1.0, 0.0, 0.16666667, 0...","[0.9691358, 0.9845679, 1.0, 0.030864198, 0.0, ...","[0.057471264, 0.028735632, 0.0, 0.9856322, 1.0...","[0.0, 0.0, 0.0, 1.0, 1.0, 1.0]","[0.0, 0.055555556, 0.11111111, 0.8888889, 1.0,...","[0.0, 0.07692308, 0.15384616, 0.84615386, 1.0,...","[1.0, 1.0, 1.0, 0.0, 0.04761905, 0.04761905]","[0.06896552, 0.03448276, 0.0, 1.0, 0.98275864,...","[0.051282052, 0.025641026, 0.0, 1.0, 0.9871794...",11932235


In [11]:
# get previous ecg data to get target columns and split into train, val and test
x_train_ecg = pd.read_csv('./processed-datasets/x_train_ecg.csv')
x_val_ecg = pd.read_csv('./processed-datasets/x_val_ecg.csv')
x_test_ecg = pd.read_csv('./processed-datasets/x_test_ecg.csv')

In [12]:
# renaming MRN column
x_train_ecg.rename(columns={'MRN_DEID': 'MRN'}, inplace=True)
x_val_ecg.rename(columns={'MRN_DEID': 'MRN'}, inplace=True)
x_test_ecg.rename(columns={'MRN_DEID': 'MRN'}, inplace=True)


In [13]:
# Merging the training set
df_ecg_train = pd.merge(df_ecg, x_train_ecg[['MRN', 'CardiacFuture']], on='MRN', how='inner')

# Merging the validation set
df_ecg_val = pd.merge(df_ecg, x_val_ecg[['MRN', 'CardiacFuture']], on='MRN', how='inner')

# Merging the test set
df_ecg_test = pd.merge(df_ecg, x_test_ecg[['MRN', 'CardiacFuture']], on='MRN', how='inner')


In [14]:
# renaming CardiacFuture column to y
df_ecg_train.rename(columns={'CardiacFuture': 'y'}, inplace=True)
df_ecg_val.rename(columns={'CardiacFuture': 'y'}, inplace=True)
df_ecg_test.rename(columns={'CardiacFuture': 'y'}, inplace=True)

### some patients removed in df_ecg train, val, test, make sure the patients are same in tabular train, val, test later when doing joint fusion model

## saving datasets in processed-datasets folder to be load in another notebook

In [15]:
df_ecg_train.to_pickle("./processed-datasets/df_ecg_train.pkl")
df_ecg_val.to_pickle("./processed-datasets/df_ecg_val.pkl")
df_ecg_test.to_pickle("./processed-datasets/df_ecg_test.pkl")
