In [15]:
import sys 
sys.path.append("./code")
from clinical_ts.timeseries_utils import *
from clinical_ts.ecg_utils import *
from pathlib import Path
import os
from os.path import isdir
import subprocess
import wget
import shutil
import pdb

# prepare data

In [2]:
# Target sampling frequency handed to every prepare_data_* call below (presumably in Hz).
target_fs = 100
# Raw downloads are unpacked below data_root; preprocessed output is written below target_root.
# Both currently resolve to the same ./data directory.
data_root = Path("./data/")
target_root = Path("./data")
# exist_ok=True keeps this cell safe to re-run and avoids the check-then-create race
# of a separate isdir() test.
os.makedirs(data_root, exist_ok=True)
os.makedirs(target_root, exist_ok=True)

In [18]:
def download(data_url, dataset_dir, flatten=True):
    """Download an archive, unpack it into ``dataset_dir`` and delete the archive.

    Args:
        data_url: URL of the archive; it is fetched with ``wget.download`` into
            the module-level ``data_root`` directory.
        dataset_dir: directory (``str`` or ``pathlib.Path``) the archive is
            unpacked into.
        flatten: if True and the unpacked archive consists of exactly one
            top-level folder, move that folder's contents directly into
            ``dataset_dir`` and remove the emptied folder. Any other layout
            is left exactly as unpacked.
    """
    filename = wget.download(data_url, out=str(data_root))
    try:
        shutil.unpack_archive(str(filename), dataset_dir)
        if flatten:
            entries = os.listdir(dataset_dir)
            # Only a single wrapping directory can be flattened unambiguously;
            # picking an arbitrary first entry could hit a file or the wrong folder.
            if len(entries) == 1:
                wrapper = os.path.join(dataset_dir, entries[0])
                if os.path.isdir(wrapper):
                    for name in os.listdir(wrapper):
                        shutil.move(os.path.join(wrapper, name), str(dataset_dir))
                    os.rmdir(wrapper)
    finally:
        # Remove the downloaded archive even when unpacking or flattening fails.
        os.remove(filename)

## Ribeiro 2020

Downloads and preprocesses the test set from Ribeiro et al. 2020 (paper: https://www.nature.com/articles/s41467-020-15432-4, data: https://doi.org/10.5281/zenodo.3625006).

In [20]:
# Source archive, raw unpack location and preprocessed output location for the Ribeiro test set.
ribeiro_test_url = "https://zenodo.org/record/3765780/files/data.zip?download=1"
data_folder_ribeiro_test = data_root / "ribeiro2020_test"
target_folder_ribeiro_test = target_root / f"ribeiro_fs{target_fs}"

In [None]:
# Fetch the Ribeiro archive and unpack it into the raw data folder (flatten defaults to True).
download(data_url=ribeiro_test_url, dataset_dir=data_folder_ribeiro_test)

In [None]:
# Run the project's Ribeiro preprocessing on the raw download.
# NOTE(review): the four results are presumably a metadata frame, the label vocabulary and
# channel mean/std, with outputs written to target_folder — confirm against the clinical_ts helpers.
(
    df_ribeiro_test,
    lbl_itos_ribeiro_test,
    mean_ribeiro_test,
    std_ribeiro_test,
) = prepare_data_ribeiro_test(
    data_folder_ribeiro_test,
    target_fs=target_fs,
    channels=12,
    channel_stoi=channel_stoi_default,
    target_folder=target_folder_ribeiro_test,
)

In [None]:
# Consolidate the preprocessed Ribeiro records into one memmap file for efficiency.
# NOTE(review): delete_npys=True presumably discards the per-record .npy files afterwards — confirm.
reformat_as_memmap(
    df_ribeiro_test,
    target_folder_ribeiro_test / "memmap.npy",
    data_folder=target_folder_ribeiro_test,
    delete_npys=True,
)

In [None]:
# Delete the raw download folder. os.rmdir only removes *empty* directories and would
# raise OSError here because the unpacked dataset is still inside, so remove the whole tree.
shutil.rmtree(data_folder_ribeiro_test)

## Zheng 2020

Downloads and preprocesses the dataset from Zheng et al 2020 (https://www.nature.com/articles/s41597-020-0386-x) https://figshare.com/collections/ChapmanECG/4560497/2 

In [7]:
# Source archive, raw unpack location and preprocessed output location for the Chapman dataset.
chapman_url = "https://figshare.com/ndownloader/files/15651326"
data_folder_chapman = data_root / "chapman/"
target_folder_chapman = target_root / f"chapman_fs{target_fs}"

In [None]:
# Fetch the Chapman archive and unpack it as-is (no flattening of the archive layout).
download(data_url=chapman_url, dataset_dir=data_folder_chapman, flatten=False)

In [9]:
# Figshare download links for the additional Chapman files fetched in the next cell.
# NOTE(review): the names suggest condition/rhythm/diagnostic/attribute tables — confirm file contents.
condition = "https://figshare.com/ndownloader/files/15651293"
rhythm = "https://figshare.com/ndownloader/files/15651296"
diagnostic = "https://figshare.com/ndownloader/files/15653771"
attributes = "https://figshare.com/ndownloader/files/15653762"

In [10]:
# Download each additional Chapman file into the raw Chapman folder.
for url in (condition, rhythm, diagnostic, attributes):
    wget.download(url, out=str(data_folder_chapman))

100% [............................................................] 9960 / 9960

In [11]:
# Run the project's Chapman preprocessing on the raw (non-denoised) recordings.
# NOTE(review): the four results are presumably a metadata frame, the label vocabulary and
# channel mean/std, with outputs written to target_folder — confirm against the clinical_ts helpers.
(
    df_chapman,
    lbl_itos_chapman,
    mean_chapman,
    std_chapman,
) = prepare_data_chapman(
    data_folder_chapman,
    denoised=False,
    target_fs=target_fs,
    channels=12,
    channel_stoi=channel_stoi_default,
    target_folder=target_folder_chapman,
)

  0%|          | 0/10646 [00:00<?, ?it/s]

  0%|          | 0/67 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/52 [00:00<?, ?it/s]

  0%|          | 0/73 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

  0%|          | 0/90 [00:00<?, ?it/s]

  0%|          | 0/91 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/102 [00:00<?, ?it/s]

  0%|          | 0/108 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

  0%|          | 0/127 [00:00<?, ?it/s]

  0%|          | 0/119 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/127 [00:00<?, ?it/s]

  0%|          | 0/139 [00:00<?, ?it/s]

  0%|          | 0/152 [00:00<?, ?it/s]

  0%|          | 0/187 [00:00<?, ?it/s]

  0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/203 [00:00<?, ?it/s]

  0%|          | 0/202 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/199 [00:00<?, ?it/s]

  0%|          | 0/231 [00:00<?, ?it/s]

  0%|          | 0/325 [00:00<?, ?it/s]

  0%|          | 0/447 [00:00<?, ?it/s]

  0%|          | 0/468 [00:00<?, ?it/s]

  0%|          | 0/710 [00:00<?, ?it/s]

  0%|          | 0/622 [00:00<?, ?it/s]

  0%|          | 0/509 [00:00<?, ?it/s]

  0%|          | 0/643 [00:00<?, ?it/s]

  0%|          | 0/1366 [00:00<?, ?it/s]

  0%|          | 0/2200 [00:00<?, ?it/s]

In [None]:
# Consolidate the preprocessed Chapman records into one memmap file for efficiency.
# NOTE(review): delete_npys=True presumably discards the per-record .npy files afterwards — confirm.
reformat_as_memmap(
    df_chapman,
    target_folder_chapman / "memmap.npy",
    data_folder=target_folder_chapman,
    delete_npys=True,
)

In [None]:
# Delete the raw download folder. os.rmdir only removes *empty* directories and would
# raise OSError here (the unpacked archive and the extra downloaded files are still inside),
# so remove the whole tree.
shutil.rmtree(data_folder_chapman)

## CinC2020 Challenge

This training set stems from the CinC Challenge 2020 https://physionetchallenges.org/2020/ 

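In this case you have to download the archives manually via the links below and place them in the notebook's working directory (the cells further down unpack them from there by file name):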

https://storage.cloud.google.com/physionet-challenge-2020-12-lead-ecg-public/PhysioNetChallenge2020_Training_CPSC.tar.gz

https://storage.cloud.google.com/physionet-challenge-2020-12-lead-ecg-public/PhysioNetChallenge2020_Training_2.tar.gz

https://storage.cloud.google.com/physionet-challenge-2020-12-lead-ecg-public/PhysioNetChallenge2020_Training_StPetersburg.tar.gz

https://storage.cloud.google.com/physionet-challenge-2020-12-lead-ecg-public/PhysioNetChallenge2020_Training_PTB.tar.gz

https://storage.googleapis.com/physionet-challenge-2020-12-lead-ecg-public/PhysioNetChallenge2020_Training_PTB-XL.tar.gz

https://storage.cloud.google.com/physionet-challenge-2020-12-lead-ecg-public/PhysioNetChallenge2020_Training_E.tar.gz




In [27]:
# Raw unpack location and preprocessed output location for the CinC 2020 training data.
data_folder_cinc = data_root / "cinc2020/"
# The archives are unpacked manually below, so the raw folder is created up front;
# exist_ok=True keeps the cell re-runnable without a separate isdir() check.
os.makedirs(data_folder_cinc, exist_ok=True)
target_folder_cinc = target_root / ("cinc_fs" + str(target_fs))

In [34]:
# File names of the manually downloaded CinC 2020 training archives.
filenames = [
    "PhysioNetChallenge2020_Training_CPSC.tar.gz",
    "PhysioNetChallenge2020_Training_2.tar.gz",
    "PhysioNetChallenge2020_Training_StPetersburg.tar.gz",
    "PhysioNetChallenge2020_Training_PTB.tar.gz",
    "PhysioNetChallenge2020_Training_PTB-XL.tar.gz",
    "PhysioNetChallenge2020_Training_E.tar.gz",
]

In [None]:
# Unpack every archive into the raw CinC folder; the bare file names are resolved
# relative to the current working directory.
for fname in filenames:
    shutil.unpack_archive(filename=fname, extract_dir=data_folder_cinc)

In [None]:
# The archives are no longer needed once unpacked; delete them from the working directory.
for fname in filenames:
    os.remove(path=fname)

In [None]:
# Run the project's CinC 2020 preprocessing on the unpacked archives.
# NOTE(review): the four results are presumably a metadata frame, the label vocabulary and
# channel mean/std, with outputs written to target_folder — confirm against the clinical_ts helpers.
(
    df_cinc,
    lbl_itos_cinc,
    mean_cinc,
    std_cinc,
) = prepare_data_cinc(
    data_folder_cinc,
    target_fs=target_fs,
    channels=12,
    channel_stoi=channel_stoi_default,
    target_folder=target_folder_cinc,
)

In [None]:
# Consolidate the preprocessed CinC records into one memmap file for efficiency.
# NOTE(review): delete_npys=True presumably discards the per-record .npy files afterwards — confirm.
reformat_as_memmap(
    df_cinc,
    target_folder_cinc / "memmap.npy",
    data_folder=target_folder_cinc,
    delete_npys=True,
)


In [None]:
# Delete the raw folder. os.rmdir only removes *empty* directories and would raise
# OSError here because the unpacked archives are still inside, so remove the whole tree.
shutil.rmtree(data_folder_cinc)

## PTB-XL

Downloads and preprocesses the PTB-XL dataset (https://www.nature.com/articles/s41597-020-0495-6) https://physionet.org/content/ptb-xl/1.0.1/ 

In [26]:
# Source archive, raw unpack location and preprocessed output location for PTB-XL.
ptb_xl_url = "https://storage.googleapis.com/ptb-xl-1.0.1.physionet.org/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.1.zip"
data_folder_ptb_xl = data_root / "ptb_xl/"
target_folder_ptb_xl = target_root / f"ptb_xl_fs{target_fs}"

In [None]:
# Fetch the PTB-XL archive and unpack it into the raw data folder (flatten defaults to True).
download(data_url=ptb_xl_url, dataset_dir=data_folder_ptb_xl)

In [None]:
# Run the project's PTB-XL preprocessing on the raw download.
# NOTE(review): min_cnt=0 presumably keeps labels regardless of frequency; the four results are
# presumably a metadata frame, the label vocabulary and channel mean/std — confirm against the helpers.
(
    df_ptb_xl,
    lbl_itos_ptb_xl,
    mean_ptb_xl,
    std_ptb_xl,
) = prepare_data_ptb_xl(
    data_folder_ptb_xl,
    min_cnt=0,
    target_fs=target_fs,
    channels=12,
    channel_stoi=channel_stoi_default,
    target_folder=target_folder_ptb_xl,
)

  0%|          | 0/21837 [00:00<?, ?it/s]

In [None]:
# Consolidate the preprocessed PTB-XL records into one memmap file for efficiency.
# NOTE(review): delete_npys=True presumably discards the per-record .npy files afterwards — confirm.
reformat_as_memmap(
    df_ptb_xl,
    target_folder_ptb_xl / "memmap.npy",
    data_folder=target_folder_ptb_xl,
    delete_npys=True,
)

In [None]:
# Delete the raw download folder. os.rmdir only removes *empty* directories and would
# raise OSError here because the unpacked dataset is still inside, so remove the whole tree.
shutil.rmtree(data_folder_ptb_xl)

## SPH

Downloads and preprocesses the SPH dataset (https://www.nature.com/articles/s41597-022-01403-5)

In [12]:
# Source archive, raw unpack location and preprocessed output location for the SPH dataset.
sph_url = "https://springernature.figshare.com/ndownloader/files/32630684"
data_folder_sph = data_root / "sph/"
target_folder_sph = target_root / f"sph_fs{target_fs}"

In [None]:
# Fetch the SPH archive and unpack it into the raw data folder (flatten defaults to True).
download(data_url=sph_url, dataset_dir=data_folder_sph)

In [22]:
# Figshare download links for the additional SPH files fetched in the next cell.
# These rebind the names `attributes` and `diagnostic` used earlier for the Chapman files.
# NOTE(review): the names suggest attribute/diagnostic tables — confirm file contents.
attributes = "https://springernature.figshare.com/ndownloader/files/34793152"
diagnostic = "https://springernature.figshare.com/ndownloader/files/32630954"

In [23]:
# Download each additional SPH file into the raw SPH folder.
for url in (diagnostic, attributes):
    wget.download(url, out=str(data_folder_sph))

100% [........................................................] 999219 / 999219

In [None]:
# Run the project's SPH preprocessing on the raw download.
# NOTE(review): min_cnt=0 presumably keeps labels regardless of frequency; the four results are
# presumably a metadata frame, the label vocabulary and channel mean/std — confirm against the helpers.
(
    df_sph,
    lbl_itos_sph,
    mean_sph,
    std_sph,
) = prepare_data_sph(
    data_folder_sph,
    min_cnt=0,
    target_fs=target_fs,
    channels=12,
    channel_stoi=channel_stoi_default,
    target_folder=target_folder_sph,
)

In [None]:
# Consolidate the preprocessed SPH records into one memmap file for efficiency.
# NOTE(review): delete_npys=True presumably discards the per-record .npy files afterwards — confirm.
reformat_as_memmap(
    df_sph,
    target_folder_sph / "memmap.npy",
    data_folder=target_folder_sph,
    delete_npys=True,
)

In [None]:
# Delete the raw download folder. os.rmdir only removes *empty* directories and would
# raise OSError here (the unpacked archive and the extra downloaded files are still inside),
# so remove the whole tree.
shutil.rmtree(data_folder_sph)