In [50]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from collections import Counter
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

### Load Labels

In [None]:
# Read the per-recording diagnostics table, then show how many recordings
# carry each original rhythm label.
diagnostic = pd.read_csv("Diagnostics.csv")
rhythm_counts = Counter(diagnostic["Rhythm"])
print(rhythm_counts)

### Merge 11 classes into 4

In [3]:
# Lookup table: original rhythm label -> merged class (AFIB, GSVT, SB or SR).
# Per the original note, the "SA" label stands for SI (Sinus Irregularity);
# it is folded into SR.
merge_dict = {
    **dict.fromkeys(["AFIB", "AF"], "AFIB"),
    **dict.fromkeys(["SVT", "AT", "SAAWR", "ST", "AVNRT", "AVRT"], "GSVT"),
    **dict.fromkeys(["SB"], "SB"),
    **dict.fromkeys(["SR", "SA"], "SR"),
}

In [None]:
# Translate every original rhythm label into its merged class.
# A label missing from merge_dict raises KeyError.
rhytm_col = [merge_dict[rhythm] for rhythm in diagnostic["Rhythm"]]

# Class balance after merging.
print(Counter(rhytm_col))

In [5]:
# Add the merged class as a "label" column next to the original "Rhythm".
diagnostic = diagnostic.assign(label=rhytm_col)

In [None]:
# Show file name, original rhythm and merged label side by side.
print(diagnostic.loc[:, ["FileName", "Rhythm", "label"]])

### Load ECG Data

In [None]:
# The denoised ECG recordings are expected in ./ECGDataDenoised/ under the
# current working directory. data_path keeps its trailing slash because later
# cells build file paths with `data_path + <file name>`.
path = os.getcwd()
data_path = path + '/ECGDataDenoised/'
# os.listdir returns entries in arbitrary order; sort them so the order of the
# loaded samples -- and therefore the seeded train/test split further down --
# is the same on every machine and every run.
files = sorted(os.listdir(data_path))
print('Total files:', len(files))

In [None]:
# Lookup table from the diagnostics "FileName" value to its merged class label.
# If a file name occurs more than once, the last occurrence wins.
mapping = dict(zip(diagnostic["FileName"], diagnostic["label"]))

print(len(mapping))

### Load Denoised Data

In [None]:
# Collect one signal array per recording in `x` and its merged label in `y`.
x = []
y = []
for file_name in tqdm(files, desc = "Loading Data"):
    csv_path = data_path + file_name

    # Files smaller than 425 KiB are skipped
    # (presumably truncated/incomplete recordings -- TODO confirm the threshold).
    if os.stat(csv_path).st_size / 1024 < 425:
        continue

    # The text before the first dot of the file name is the key into `mapping`.
    record_id = file_name.split('.')[0]
    frame = pd.read_csv(csv_path, header=None)
    # Transpose so that the CSV columns become the first axis of the array.
    signal = frame.to_numpy().transpose()

    # Validation: the first file that fails a check is reported and the loop
    # stops right there, so `x`/`y` then hold only the files loaded so far.
    if frame.isnull().values.any() or np.isnan(signal).any():
        print("NaN Error file: ", record_id)
        nan_rows = frame[frame.isnull().any(axis=1)]
        print(nan_rows)
        break
    elif 0 in signal:
        print("Zero Error file: ", record_id)
        break
    elif signal.shape != (12, 5000):
        print("Shape Error in file: ", record_id)
        break

    x.append(signal)
    y.append(mapping[record_id])

In [None]:
# Report how many signals and how many labels were collected.
n_signals, n_labels = len(x), len(y)
print("x length: ", n_signals, ", y length: ", n_labels)

In [None]:
# Convert the list of per-recording arrays into one ndarray, then report its
# shape and the number of samples per merged class.
x = np.array(x)
print(x.shape)
class_counts = Counter(y)
print(class_counts)

In [None]:
# Transform the flat label list into a two-dimensional column vector
# (one row per sample, one column), the layout the encoder below is fitted on.
y_np = np.array(y).reshape(-1, 1)
print(y_np.shape)

In [None]:
# One-hot encode the merged class labels.
# Order: AFIB, GSVT, SB, SR  (the fitted order is available in encoder.categories_)
#
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the current keyword first and fall back to
# the legacy one so the cell runs on both old and new releases.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
y_onehot = encoder.fit_transform(y_np)
print(y_onehot.shape)

##### Save 500 Hz data

In [None]:
# Stratified 80/20 train/test split with a fixed seed.
# NOTE(review): the split (and the saved label files) use `y`, the class-name
# strings, not `y_onehot` -- confirm this is what downstream code expects.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=10, stratify=y
)

# Write the 500 Hz signals and their labels to the working directory.
for npy_name, array in (
    ('x_train_500.npy', x_train),
    ('y_train.npy', y_train),
    ('x_test_500.npy', x_test),
    ('y_test.npy', y_test),
):
    np.save(npy_name, array)

##### Create and Save 100 Hz data

In [None]:
pip install wfdb
from wfdb.processing import resample_sig


def downsample_wfdb(x, fs=500, fs_target=100):
    """Resample every lead of every recording with wfdb's ``resample_sig``.

    Parameters
    ----------
    x : array-like
        Recordings indexed as ``x[record][lead]``; each lead is a 1-D signal
        sampled at ``fs``. All leads present are processed (the lead count is
        taken from the data instead of being fixed at 12).
    fs : int, optional
        Sampling frequency of the input signals. Defaults to 500.
    fs_target : int, optional
        Sampling frequency to resample to. Defaults to 100.

    Returns
    -------
    numpy.ndarray
        The resampled signals, in the same record/lead order as the input.
        An input without any records yields an empty array.
    """
    # resample_sig returns a tuple; element 0 is the resampled signal.
    return np.array([
        [resample_sig(lead, fs, fs_target)[0] for lead in record]
        for record in x
    ])


# Downsample both splits and report the resulting array shapes.
x_train100, x_test100 = (downsample_wfdb(part) for part in (x_train, x_test))
print('x_train:', x_train100.shape, 'x_test:', x_test100.shape)

# Write the downsampled signals to the working directory.
for npy_name, array in (('x_train_100.npy', x_train100), ('x_test_100.npy', x_test100)):
    np.save(npy_name, array)