# Icentia11k-05-download-sequences.ipynb
Download Icentia11k ECG sequences using the ordered_master_xxx.csv files.  
Run Icentia11k-04-ordered-sequences.ipynb and its prerequisites to create the CSV files.  
See https://physionet.org/content/icentia11k-continuous-ecg/1.0/

### Environment setup  

In [1]:
# Environment setup.
import os
import re
import sys
import numpy as np

import fileutils as fu
import icentia11k as ic
import icentia11k_wfdb_utils as wfu

# Show the current working directory.
print('You are here: {}'.format(os.getcwd()))

You are here: D:\dev\jupyter\deep-cnn-embedded\src\icentia11k


In [31]:
# Global objects
ordered_master_fd = {}  # Dictionary of ordered sequences master file descriptors, keyed by rhythm type

# Data file match patterns.
# Each pattern matches the name of a local ECG .dat file of one rhythm type and
# captures the five-digit patient ID as group 1.
# Raw strings are used so the regex escapes (\d, \.) are not parsed as string
# escape sequences, which newer Python versions warn about.
afl_dat_file_pat = re.compile(r'p(\d{5})_s\d{2}_AFL_\d{7}_\d{7}\.dat')
afib_dat_file_pat = re.compile(r'p(\d{5})_s\d{2}_AFIB_\d{7}_\d{7}\.dat')
n_dat_file_pat = re.compile(r'p(\d{5})_s\d{2}_N_\d{7}_\d{7}\.dat')
q_dat_file_pat = re.compile(r'p(\d{5})_s\d{2}_Q_\d{7}_\d{7}\.dat')

#### Functions to open and close files

In [32]:
# Function to open all ordered master sequence files.
def open_all_master_files():
    """Open the ordered master CSV file of every rhythm type in ic.RTYPES.

    Each file is named '<ic.ORDERED_MASTER_BN>_<rtype>.csv' inside
    ic.LOCAL_DATA_PATH and is opened with fu.open_file().  Every result that
    is not None is stored in the module-level ordered_master_fd dictionary
    under its rhythm type; a None result leaves the dictionary untouched for
    that rhythm type.
    """
    global ordered_master_fd
    for rhythm in ic.RTYPES:
        csv_name = '{}_{}.csv'.format(ic.ORDERED_MASTER_BN, rhythm)
        handle = fu.open_file(os.path.join(ic.LOCAL_DATA_PATH, csv_name))
        if handle is None:
            continue  # Nothing to register for this rhythm type
        ordered_master_fd[rhythm] = handle

# Function to close all ordered master sequence files.
def close_all_master_files():
    """Close every registered ordered master file and forget its descriptor.

    For each rhythm type in ic.RTYPES, the descriptor stored in the
    module-level ordered_master_fd dictionary (if any) is removed from the
    dictionary and passed to fu.close_file().  Because the entry is removed,
    calling this function again is a no-op for that rhythm type, and an
    already-closed descriptor can no longer be picked up from the dictionary.
    """
    global ordered_master_fd
    for rtype in ic.RTYPES:
        # pop() drops the entry so the same descriptor is never closed twice.
        fd = ordered_master_fd.pop(rtype, None)
        if fd is not None:
            fu.close_file(fd)

#### Function to get sequence from Icentia11k database and write it locally

In [47]:
def get_sequence(patient_id, segment_id, rtype, start, length, stream):
    """Read one ECG sequence, validate its signal, and write it locally.

    The annotation and the ECG record are read with wfu.read_annotation() and
    wfu.read_ecg().  The signal (ecg.p_signal) is rejected when it contains
    +/-inf or NaN samples, or when its standard deviation is below 1e-3 or is
    not finite.  A rejection prints a '***' diagnostic that names the sequence
    via ic.get_wfdb_basename().  Only a sequence that passes every check is
    written with wfu.write_annotation() and wfu.write_ecg().

    Args:
        patient_id: Patient identifier, forwarded to the wfu/ic helpers.
        segment_id: Segment identifier, forwarded to the wfu/ic helpers.
        rtype: Rhythm type, forwarded to the write helpers and used in the
            diagnostic name.
        start: Start of the sequence, forwarded to the wfu/ic helpers.
        length: Length of the sequence, forwarded to the wfu/ic helpers.
        stream: Forwarded unchanged to the wfu read helpers.

    Returns:
        True if the sequence was read, passed validation and both writes
        reported success; False otherwise.
    """
    ann = wfu.read_annotation(patient_id, segment_id, start=start, length=length, stream=stream)
    if ann is None:
        return False
    ecg = wfu.read_ecg(patient_id, segment_id, start=start, length=length, stream=stream)
    if ecg is None:
        return False

    signal = ecg.p_signal
    # Look for non-finite samples BEFORE computing the standard deviation:
    # any inf/NaN sample makes np.std() return NaN, so testing the std first
    # would hide the specific inf/NaN diagnostics behind "ECG std: nan".
    if np.isinf(signal).any():
        problem = 'ECG contains +/-inf'
    elif np.isnan(signal).any():
        problem = 'ECG contains NaN'
    else:
        ecg_std = np.std(signal)
        # A (nearly) flat signal, or one whose std is not finite, is rejected.
        if (ecg_std < 1e-3) or not np.isfinite(ecg_std):
            problem = 'ECG std: {:0.4f}'.format(ecg_std)
        else:
            problem = None

    if problem is not None:
        print('\n*** {}: {}'.format(
            problem,
            ic.get_wfdb_basename(patient_id, segment_id, rtype, start, length)))
        return False

    if not wfu.write_annotation(ann, patient_id, segment_id, rtype, start, length):
        return False
    if not wfu.write_ecg(ecg, patient_id, segment_id, rtype, start, length):
        return False
    return True

### Download ECG rhythm sequences
Set `batch_size` to the maximum number of sequences of each type to download.  
If `ignore_existing` is `True`, then ECG sequence files that already exist are skipped and <u>don't</u> count against the batch size.  Otherwise, ECG sequence files that already exist are skipped, but they <u>do</u> count against the batch size.  
Each sequence is checked for errors.  If an error is found, the sequence should be manually removed from its ordered_master_*.csv file.  

In [48]:
# Download settings.
ignore_existing = True  # True: sequences already on disk don't count against batch_size
batch_size = 20000      # Maximum number of sequences to count per rhythm type
stream = True           # Forwarded unchanged to get_sequence()
# One counter per rhythm type, derived from ic.RTYPES so the counters and the
# target always agree with the master files that are actually read.
seq_counts = {rtype: 0 for rtype in ic.RTYPES}
download_count = 0      # Sequences fetched and written during this run
total_count = 0         # Sequences counted against the batch sizes
target_count = len(seq_counts) * batch_size
exhausted = set()       # Rhythm types with nothing left to read

print('Target count: {}'.format(target_count - sum(seq_counts.values())))

open_all_master_files()
try:
    while sum(seq_counts.values()) < target_count:
        # Rhythm types that still need sequences and still have lines to read.
        # They are visited round-robin, one master-file line each per pass.
        pending = [rtype for rtype in ic.RTYPES
                   if seq_counts[rtype] < batch_size and rtype not in exhausted]
        if not pending:
            # Every remaining master file has ended (or was never opened).
            # Without this exit the loop would spin forever below the target.
            break

        for rtype in pending:
            fd = ordered_master_fd.get(rtype)
            if fd is None:
                print('\n*** No ordered master file open for {}'.format(rtype))
                exhausted.add(rtype)
                continue

            raw_line = fd.readline()
            if not raw_line:
                exhausted.add(rtype)  # readline() returns '' only at end of file
                continue
            sline = raw_line.strip()
            if len(sline) == 0:
                continue  # Blank line inside the file: skip it, keep reading

            # Columns used: 0 = patient ID and 1 = segment ID (each with a
            # one-character prefix), 3 = start, 4 = length.
            slist = sline.split(',')
            patient_id = int(slist[0][1:])
            segment_id = int(slist[1][1:])
            start = int(slist[3])
            length = int(slist[4])

            path, file = wfu.get_local_filename(patient_id, segment_id, rtype, start, length)
            dat_file = os.path.join(path, '{}.dat'.format(file))
            if not os.path.isfile(dat_file):
                # Not on disk yet: it counts only if the download succeeds.
                counted = get_sequence(patient_id, segment_id, rtype, start, length, stream)
                if counted:
                    download_count += 1
            else:
                # Already on disk: counts only when existing files are not ignored.
                counted = not ignore_existing

            if counted:
                seq_counts[rtype] += 1
                total_count += 1
                if (total_count % 100) == 0:
                    print('{} '.format(total_count), end='')  # Progress indicator
finally:
    # Always release the master files, even if the loop raised or was interrupted.
    close_all_master_files()

print(total_count)
print('Total downloaded: {}'.format(download_count))

Target count: 80000
100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7600 7700 7800 7900 8000 8100 8200 8300 8400 8500 8600 8700 8800 8900 9000 9100 9200 9300 9400 9500 9600 9700 9800 9900 10000 10100 10200 10300 10400 10500 10600 10700 10800 10900 11000 11100 11200 11300 11400 11500 11600 11700 11800 11900 12000 12100 12200 12300 12400 12500 12600 12700 12800 12900 13000 13100 13200 13300 13400 13500 13600 13700 13800 13900 14000 14100 14200 14300 14400 14500 14600 14700 14800 14900 15000 15100 15200 15300 15400 15500 15600 15700 15800 15900 16000 16100 16200 16300 16400 16500 16600 16700 16800 16900 17000 17100 17200 17300 17400 17500 17600 17700 17800 17900 18000 18100 18

In [49]:
# Safety net: close the master files here in case the download cell above
# stopped on an exception (or was interrupted) and left them open.
close_all_master_files()

### Analysis  
#### Run this cell to count ECG .dat files  

In [50]:
# Tally the local ECG .dat files per rhythm type by matching every file name
# found under ic.LOCAL_DATA_PATH against the data file patterns.
dat_counts = {'AFL': 0, 'AFIB': 0, 'N': 0, 'Q': 0}
dat_patterns = (
    ('AFL', afl_dat_file_pat),
    ('AFIB', afib_dat_file_pat),
    ('N', n_dat_file_pat),
    ('Q', q_dat_file_pat),
)

for (walk_dir, walk_subdirs, names) in os.walk(ic.LOCAL_DATA_PATH):
    for name in names:
        for label, pattern in dat_patterns:
            if pattern.match(name):
                dat_counts[label] += 1
                break  # A file name is counted for the first pattern it matches

afl_count = dat_counts['AFL']
afib_count = dat_counts['AFIB']
n_count = dat_counts['N']
q_count = dat_counts['Q']

print('AFL:   {}'.format(afl_count))
print('AFIB:  {}'.format(afib_count))
print('N:     {}'.format(n_count))
print('Q:     {}'.format(q_count))
print('Total: {}'.format(sum(dat_counts.values())))

AFL:   150000
AFIB:  150000
N:     150000
Q:     150000
Total: 600000


#### Run this cell to compute statistics on patient IDs

In [51]:
# Per-patient sequence counts, indexed by the patient ID parsed from each
# .dat file name: one array for all rhythm types and one per rhythm type.
pid_array = np.zeros(ic.NUM_PATIENTS, dtype=np.int32)
afib_array = np.zeros(ic.NUM_PATIENTS, dtype=np.int32)
afl_array = np.zeros(ic.NUM_PATIENTS, dtype=np.int32)
n_array = np.zeros(ic.NUM_PATIENTS, dtype=np.int32)
q_array = np.zeros(ic.NUM_PATIENTS, dtype=np.int32)

# Pair each file pattern with the array it feeds, so every file name is
# matched once per pattern and the match object is reused for the patient ID.
pattern_arrays = (
    (afib_dat_file_pat, afib_array),
    (afl_dat_file_pat, afl_array),
    (n_dat_file_pat, n_array),
    (q_dat_file_pat, q_array),
)

for (dirpath, dirs, files) in os.walk(ic.LOCAL_DATA_PATH):
    for file in files:
        for pattern, rtype_array in pattern_arrays:
            found = pattern.match(file)
            if found:
                pid = int(found.group(1))  # Group 1 is the five-digit patient ID
                pid_array[pid] += 1
                rtype_array[pid] += 1
                break  # A file name is counted for the first pattern it matches

# One report line per array; the labels are padded so the columns line up.
report_rows = (
    (' ALL', pid_array),
    ('AFIB', afib_array),
    (' AFL', afl_array),
    ('   N', n_array),
    ('   Q', q_array),
)
for label, histo in report_rows:
    histo_sum, histo_cnt, histo_min, histo_avg, histo_max = ic.get_histo_stats(histo)
    print('{}: Seq: {:6d} PIDs: {:5d}  min: {:3d}  avg: {:6.1f}  max: {:4d}'.format(
        label, histo_sum, histo_cnt, histo_min, histo_avg, histo_max))

 ALL: Seq: 600000 PIDs: 10894  min:   8  avg:   55.1  max: 5741
AFIB: Seq: 150000 PIDs:   727  min:   1  avg:  206.3  max: 4622
 AFL: Seq: 150000 PIDs:   494  min:   1  avg:  303.6  max: 5333
   N: Seq: 150000 PIDs: 10259  min:   2  avg:   14.6  max: 5333
   Q: Seq: 150000 PIDs: 10850  min:   1  avg:   13.8  max: 2031
