# Icentia11k-03-select-sequences.ipynb
Analyze the Icentia11k rhythm CSV files and create a set of CSV files that identify specific sequence examples.  
Run Icentia11k-02-find-sequences.ipynb to create rhythm CSV files, then run this notebook to create sequence CSV files.  
See https://physionet.org/content/icentia11k-continuous-ecg/1.0/  

### Environment setup

In [1]:
import glob
import os
import re
import sys

import fileutils as fu
import icentia11k as ic

In [2]:
# Notebook-wide settings shared by the cells below.

# Durations, in seconds.
seq_sec = 30        # length of each extracted sequence
seq_leader_sec = 5  # length of the leader preceding each sequence - used for filtering

# The same two durations expressed in samples (converted by ic.num_samples).
seq_length = ic.num_samples(seq_sec)
seq_leader_length = ic.num_samples(seq_leader_sec)

# Compiled form of ic.RHYTHMS_FILE_RE; its first capture group is used below
# as the patient-id range when naming the sequences CSV files.
rhythms_file_pat = re.compile(ic.RHYTHMS_FILE_RE)

#### Functions  

In [3]:
# Parse a line from the rhythms CSV file into zero or more examples matching the desired lengths.
# Returns a list of (pid, sid, rtype, start, length) tuples of the desired length.
def parse_rhythm(rhythm_list, seq_leader_length, seq_length):
    """Cut one rhythm record into fixed-size examples, each preceded by a leader.

    Args:
        rhythm_list: five fields (pid, sid, rtype, start, length) from one line
            of a rhythms CSV file; start and length must be convertible by int().
        seq_leader_length: number of samples of leader placed before each sequence.
        seq_length: number of samples of rhythm required for each sequence.

    Returns:
        A list of (pid, sid, rtype, start, length) tuples in which start is an
        int offset and length is always seq_leader_length + seq_length. The
        list is empty when the rhythm cannot supply a single example.
    """
    pid, sid, rtype, start, length = rhythm_list
    rhythm_len = int(length)
    cursor = int(start)        # where the not-yet-consumed rhythm begins
    remaining = rhythm_len     # samples of rhythm not yet consumed
    window = seq_leader_length + seq_length
    examples = []

    while remaining >= seq_length:
        has_leader = cursor >= seq_leader_length

        if not has_leader and (cursor + rhythm_len) < window:
            # No room for a leader before the rhythm, and the rhythm is too
            # short to donate its own beginning as leader: nothing usable.
            remaining = 0
            continue

        if has_leader:
            # Enough signal precedes the cursor to act as the leader.
            window_start = cursor - seq_leader_length
            consumed = seq_length
        else:
            # Start at sample 0 and let the leader run into the beginning of
            # the rhythm; the window then ends at offset `window`.
            window_start = 0
            consumed = window - cursor

        examples.append((pid, sid, rtype, window_start, window))
        remaining -= consumed
        cursor += consumed

    return examples

In [4]:
# Add a list of rhythm sequences to a CSV file.
# Data is appended to the end of the file.
def add_sequences_to_file(sequences_list, sequences_file):
    """Append rhythm sequences to a CSV file, one record per line.

    Args:
        sequences_list: iterable of (pid, sid, rtype, start, length) tuples.
        sequences_file: path of the CSV file, opened in append mode so any
            existing content is kept.
    """
    # The with-statement releases the handle on exit (even on error), so no
    # explicit close is needed afterwards.
    with fu.open_file(sequences_file, 'a') as fd:
        for (pid, sid, rtype, start, length) in sequences_list:
            fd.write('{},{},{},{},{}\n'.format(pid, sid, rtype, start, length))

### Run this cell to create the sequences CSV files
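Each time this cell is run, it appends to any existing sequences CSV files, so re-running it duplicates records. Delete the existing `sequences_*.csv` files before re-running.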

In [9]:
# For every rhythms CSV file under each data sub-directory, split its rhythms
# into fixed-size examples and append them to per-rhythm-type sequences files.
for sd in ic.SUBDIRS:
    local_dir = os.path.join(ic.LOCAL_DATA_PATH, sd)
    rhythm_file_list = sorted(glob.glob(os.path.join(local_dir, 'rhythms*.csv')))

    for rhythm_file in rhythm_file_list:
        print(rhythm_file)

        # The glob is looser than the regex, so guard against a file name the
        # pattern does not recognise instead of failing on None.group().
        pid_match = rhythms_file_pat.search(rhythm_file)
        if pid_match is None:
            print('  skipped: file name does not match ic.RHYTHMS_FILE_RE')
            continue
        pid_range = pid_match.group(1)

        # Collect the sequences per output file so that each output file is
        # opened once per rhythms file rather than once per input line.
        # Record order within each output file is unchanged.
        sequences_by_path = {}

        rfd = fu.open_file(rhythm_file)
        try:
            for line in rfd:
                line = line.strip()
                if not line:
                    # Blank lines carry no rhythm record.
                    continue

                # Split the line in the rhythms file into a list of individual parts.
                rhythm_list = line.split(',')

                sequences_list = parse_rhythm(rhythm_list, seq_leader_length, seq_length)
                sequences_file = 'sequences_{}_{}.csv'.format(pid_range, rhythm_list[2].upper())
                sequences_path = os.path.join(local_dir, sequences_file)

                # An empty list is still registered so the output file gets
                # created, as it was when every line triggered an append.
                sequences_by_path.setdefault(sequences_path, []).extend(sequences_list)
        finally:
            # Release the input file even if a line fails to parse.
            fu.close_file(rfd)

        for sequences_path, sequences_list in sequences_by_path.items():
            add_sequences_to_file(sequences_list, sequences_path)

D:\dev\jupyter\Icentia11k\data\p00\rhythms_p00000_p00099.csv
D:\dev\jupyter\Icentia11k\data\p00\rhythms_p00100_p00199.csv
D:\dev\jupyter\Icentia11k\data\p00\rhythms_p00200_p00299.csv
D:\dev\jupyter\Icentia11k\data\p00\rhythms_p00300_p00399.csv
D:\dev\jupyter\Icentia11k\data\p00\rhythms_p00400_p00499.csv
D:\dev\jupyter\Icentia11k\data\p00\rhythms_p00500_p00599.csv
D:\dev\jupyter\Icentia11k\data\p00\rhythms_p00600_p00699.csv
D:\dev\jupyter\Icentia11k\data\p00\rhythms_p00700_p00799.csv
D:\dev\jupyter\Icentia11k\data\p00\rhythms_p00800_p00899.csv
D:\dev\jupyter\Icentia11k\data\p00\rhythms_p00900_p00999.csv
D:\dev\jupyter\Icentia11k\data\p01\rhythms_p01000_p01099.csv
D:\dev\jupyter\Icentia11k\data\p01\rhythms_p01100_p01199.csv
D:\dev\jupyter\Icentia11k\data\p01\rhythms_p01200_p01299.csv
D:\dev\jupyter\Icentia11k\data\p01\rhythms_p01300_p01399.csv
D:\dev\jupyter\Icentia11k\data\p01\rhythms_p01400_p01499.csv
D:\dev\jupyter\Icentia11k\data\p01\rhythms_p01500_p01599.csv
D:\dev\jupyter\Icentia11

### Check the files created above

In [5]:
# File-name patterns for the sequences CSV files, one per rhythm type.
# Raw strings keep \d and \. intact for the regex engine and avoid Python's
# invalid-escape-sequence warning.
afl_file_pat = re.compile(r'sequences_p\d{5}_p\d{5}_AFL\.csv')
afib_file_pat = re.compile(r'sequences_p\d{5}_p\d{5}_AFIB\.csv')
n_file_pat = re.compile(r'sequences_p\d{5}_p\d{5}_N\.csv')
q_file_pat = re.compile(r'sequences_p\d{5}_p\d{5}_Q\.csv')


def _count_nonblank_lines(path):
    """Return the number of lines in the file at `path` that are not blank."""
    # The with-statement closes the file; no separate close call is required.
    with fu.open_file(path) as sfp:
        return sum(1 for line in sfp if len(line.strip()) > 0)


# Checked in the same order as the original if/elif chain; first match wins.
seq_file_pats = [
    ('AFL', afl_file_pat),
    ('AFIB', afib_file_pat),
    ('N', n_file_pat),
    ('Q', q_file_pat),
]
seq_counts = {rtype: 0 for (rtype, _) in seq_file_pats}

for (dirpath, dirs, files) in os.walk(ic.LOCAL_DATA_PATH):
    for file in files:
        for (rtype, file_pat) in seq_file_pats:
            if file_pat.match(file):
                seq_counts[rtype] += _count_nonblank_lines(os.path.join(dirpath, file))
                break

# Keep the individual counters available under their original names.
afib_count = seq_counts['AFIB']
afl_count = seq_counts['AFL']
n_count = seq_counts['N']
q_count = seq_counts['Q']

print('AFL: {}, AFIB: {}, N: {}, Q: {}'.format(afl_count, afib_count, n_count, q_count))

AFL: 825775, AFIB: 2424476, N: 47187038, Q: 8208588
