In [1]:
import pandas as pd
import numpy as np
import wfdb
import ast

# Data preprocessing

In [3]:
def load_raw_data(df, sampling_rate, path):
    if sampling_rate == 100:
        data = [wfdb.rdsamp(path+f) for f in df.filename_lr]
    else:
        data = [wfdb.rdsamp(path+f) for f in df.filename_hr]
    data = np.array([signal for signal, meta in data])
    return data

path = ""
sampling_rate=100

# load and convert annotation data
Y = pd.read_csv(path+'ptbxl_database.csv', index_col='ecg_id', nrows=1000)
Y.scp_codes = Y.scp_codes.apply(lambda x: ast.literal_eval(x))

# Load raw signal data
X = load_raw_data(Y, sampling_rate, path)

In [4]:
# Load scp_statements.csv for diagnostic aggregation
agg_df = pd.read_csv(path+'scp_statements.csv', index_col=0)
agg_df = agg_df[agg_df.diagnostic == 1]

def aggregate_diagnostic(y_dic):
    tmp = []
    for key in y_dic.keys():
        if key in agg_df.index:
            tmp.append(agg_df.loc[key].diagnostic_class)
    return list(set(tmp))

# Apply diagnostic superclass
Y['diagnostic_superclass'] = Y.scp_codes.apply(aggregate_diagnostic)

In [5]:
# Split data into train and test
test_fold = 10
# Train
X_train = X[np.where(Y.strat_fold != test_fold)]
y_train = Y[(Y.strat_fold != test_fold)].diagnostic_superclass
# Test
X_test = X[np.where(Y.strat_fold == test_fold)]
y_test = Y[Y.strat_fold == test_fold].diagnostic_superclass

In [6]:
y_train[0:40]

ecg_id
1         [NORM]
2         [NORM]
3         [NORM]
4         [NORM]
5         [NORM]
6         [NORM]
7         [NORM]
8           [MI]
10        [NORM]
11        [NORM]
12        [NORM]
13        [NORM]
14        [NORM]
15        [NORM]
16        [NORM]
17            []
18            []
19        [NORM]
20            []
21        [NORM]
22        [STTC]
23            []
24        [NORM]
25        [NORM]
26        [STTC]
27        [NORM]
28        [STTC]
29        [NORM]
30         [HYP]
31        [NORM]
32          [CD]
33        [NORM]
34            []
35        [NORM]
36        [NORM]
37        [NORM]
39    [MI, STTC]
41          [CD]
42        [NORM]
43        [NORM]
Name: diagnostic_superclass, dtype: object

In [7]:
from sklearn.preprocessing import MultiLabelBinarizer

# Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Fit and transform y_train
y_train_encoded = mlb.fit_transform(y_train)

# Transform y_test
y_test_encoded = mlb.transform(y_test)


In [8]:
y_train_encoded[0:40]

array([[0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 1, 0, 1],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0]])

In [9]:
from sklearn.preprocessing import StandardScaler

# Reshape X_train and X_test to 2D arrays
X_train_2d = X_train.reshape(X_train.shape[0], -1)
X_test_2d = X_test.reshape(X_test.shape[0], -1)

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform X_train
X_train_normalized = scaler.fit_transform(X_train_2d)

# Transform X_test using the scaler fitted on the training data
X_test_normalized = scaler.transform(X_test_2d)

# Reshape the normalized data back to 3D arrays
X_train_normalized = X_train_normalized.reshape(X_train.shape)
X_test_normalized = X_test_normalized.reshape(X_test.shape)


In [10]:
X_train[0]

array([[-0.119, -0.055,  0.064, ..., -0.026, -0.039, -0.079],
       [-0.116, -0.051,  0.065, ..., -0.031, -0.034, -0.074],
       [-0.12 , -0.044,  0.076, ..., -0.028, -0.029, -0.069],
       ...,
       [ 0.069,  0.   , -0.069, ...,  0.024, -0.041, -0.058],
       [ 0.086,  0.004, -0.081, ...,  0.242, -0.046, -0.098],
       [ 0.022, -0.031, -0.054, ...,  0.143, -0.035, -0.12 ]])

In [11]:
X_train_normalized[0]

array([[-0.58750481, -0.21148667,  0.33338439, ..., -0.11371942,
        -0.11485571, -0.23213584],
       [-0.57455839, -0.20551289,  0.33464201, ..., -0.12555121,
        -0.10439688, -0.22328593],
       [-0.57388145, -0.17260534,  0.37710371, ..., -0.11427138,
        -0.08913142, -0.20902292],
       ...,
       [ 0.29920911,  0.04117644, -0.24856195, ...,  0.03977347,
        -0.14466081, -0.14651976],
       [ 0.33285364,  0.02528715, -0.30445359, ...,  0.66075497,
        -0.19131544, -0.3107622 ],
       [ 0.05570448, -0.1758299 , -0.21779973, ...,  0.33600234,
        -0.17189612, -0.40157453]])

# Shape of training data
```number of examples x (sampling frequency*10 seconds) x number of channels/leads```

In [12]:
X_train_normalized.shape, y_train_encoded.shape

((871, 1000, 12), (871, 5))

# 3-D to 2-D conversion
``` (Research the best way to do it!)```

In [13]:
# X_train_normalized = X_train_normalized.reshape(X_train_normalized.shape[0], -1)
# X_test_normalized = X_test_normalized.reshape(X_test_normalized.shape[0], -1)

# DNN Model

In [67]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Input

# Define the model
model = Sequential([
    Input(shape=(X_train_normalized.shape[1],)), 
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(y_train_encoded.shape[1], activation='sigmoid')  # Sigmoid activation for multi-label classification
])


# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [68]:
# Train the model
history = model.fit(X_train_normalized, y_train_encoded, epochs=10, batch_size=32, validation_data=(X_test_normalized, y_test_encoded))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [71]:
# Predict on test set
y_pred = model.predict(X_test_normalized)

# Round predictions to convert probabilities to binary values
y_pred_binary = (y_pred > 0.5).astype(int)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_normalized, y_test_encoded)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Test Loss: 1.1445200443267822
Test Accuracy: 0.412966251373291


In [72]:
from sklearn.metrics import f1_score

# Calculate F1 score for each label
f1_scores = f1_score(y_test_encoded, y_pred_binary, average=None)

# Calculate average F1 score across all labels
average_f1_score = f1_score(y_test_encoded, y_pred_binary, average='micro')

# Print F1 score for each label and average F1 score
print("F1 Score for Each Label:", f1_scores)
print("Average F1 Score:", average_f1_score)


F1 Score for Each Label: [0.33505155 0.12871287 0.2939759  0.61851852 0.29910714]
Average F1 Score: 0.42637189103829454
