# Icentia11k-04-ordered-sequences.ipynb
Create CSV files of uniformly ordered sequences for an even distribution of patients, segments and rhythms.  
Run Icentia11k-02-find-sequences.ipynb to create rhythm CSV files.  
Run Icentia11k-03-select-sequences.ipynb to create sequence CSV files from the rhythm CSV files.  
Run this notebook to create ordered versions of the CSV files per rhythm type from the sequence CSV files.  
See https://physionet.org/content/icentia11k-continuous-ecg/1.0/

### Environment setup

In [1]:
# Environment setup.
import glob
import os
import re
import sys
from time import localtime, strftime
import numpy as np

import fileutils as fu
import icentia11k as ic

In [2]:
# Global objects.
# Compiled pattern for sequences-file names; used below both to discover the
# sequence files and, in order_file(), to extract the name's first capture group.
seq_file_pat = re.compile(ic.SEQUENCES_FILE_RE)

### Functions 

In [3]:
# Read a file into a memory list.
def load_file(file_name):
    """Return every line of file_name, in file order, as a list of strings.

    Lines are returned exactly as read (line endings are not stripped).
    """
    with fu.open_file(file_name) as fd:
        lines = list(fd)
    # Explicit close kept from the original code; presumably redundant after
    # the with-block -- TODO confirm against fileutils.
    fu.close_file(fd)
    return lines

# Parse a file line and return PID and SID as integers.
def get_pid_sid(line):
    """Return (pid, sid) parsed from one comma-separated record.

    The record must have exactly five fields; only the first two are used.
    The first character of each of those two fields is dropped before the
    integer conversion (presumably a letter prefix -- TODO confirm format).

    Raises ValueError if the field count is not five or a field is not numeric.
    """
    pid_field, sid_field, _rtype, _start, _length = line.strip().split(',')
    return int(pid_field[1:]), int(sid_field[1:])

### 1. Get a list of all sequence files

In [4]:
# Walk the local data tree and collect the full path of every file whose
# bare name matches the sequences-file pattern.
seq_file_list = []
for (dirpath, dirs, files) in os.walk(ic.LOCAL_DATA_PATH):
    for file in files:
        # match() tests the pattern from the start of the file name.
        if seq_file_pat.match(file):
            seq_file_list.append(os.path.join(dirpath, file))

In [5]:
# Report how many sequence files were found and show the first eight paths.
print('Total sequence files: {}'.format(len(seq_file_list)))
ic.pprint(seq_file_list[:8])

Total sequence files: 439
[ 'E:/Data/Icentia11k/data\\p00\\sequences_p00000_p00099_AFIB.csv',
  'E:/Data/Icentia11k/data\\p00\\sequences_p00000_p00099_AFL.csv',
  'E:/Data/Icentia11k/data\\p00\\sequences_p00000_p00099_N.csv',
  'E:/Data/Icentia11k/data\\p00\\sequences_p00000_p00099_Q.csv',
  'E:/Data/Icentia11k/data\\p00\\sequences_p00100_p00199_AFIB.csv',
  'E:/Data/Icentia11k/data\\p00\\sequences_p00100_p00199_AFL.csv',
  'E:/Data/Icentia11k/data\\p00\\sequences_p00100_p00199_N.csv',
  'E:/Data/Icentia11k/data\\p00\\sequences_p00100_p00199_Q.csv']


### 2. Create an ordered version of each sequence file  
#### Functions

In [4]:
# Get the next PID in the supplied list given the current PID and list index.
def next_pid_in_list(seq_list, index, curr_pid):
    """Scan forward from index for the first record whose PID exceeds curr_pid.

    If the scan runs off the end of the list, it wraps to index 0 and returns
    that record regardless of its PID.

    Returns (pid, sid, index) for the selected record.
    """
    total = len(seq_list)
    pid, sid = get_pid_sid(seq_list[index])
    while pid <= curr_pid:
        index += 1
        wrapped = index >= total
        if wrapped:
            # Roll over to beginning of list.
            index = 0
        pid, sid = get_pid_sid(seq_list[index])
        if wrapped:
            break
    return pid, sid, index

# Get the next SID in the supplied list for the current PID, SID and list index.
def next_sid_in_list(seq_list, index_in, curr_pid, curr_sid):
    """Find the first record at or after index_in with PID curr_pid and SID > curr_sid.

    The scan stops as soon as a record with a different PID is seen or the
    end of the list is reached; in either case the record at index_in is
    returned instead.

    Returns (pid, sid, index) for the selected record.
    """
    for pos in range(index_in, len(seq_list)):
        pid, sid = get_pid_sid(seq_list[pos])
        if pid != curr_pid:
            # Ran out of SIDs for this PID.
            break
        if sid > curr_sid:
            return pid, sid, pos

    # Nothing newer was found: fall back to the record we started from.
    pid, sid = get_pid_sid(seq_list[index_in])
    return pid, sid, index_in

# Create an ordered file from a single sequence file.
def order_file(seq_file):
    """Write 'ordered_<range>.csv' next to seq_file with its records re-ordered.

    Records are emitted by repeatedly advancing to the next PID (wrapping at
    the end of the list) and, when the record found does not have a newer SID
    than the last one emitted for that PID, trying to pick a later SID for it.
    Each emitted record is removed from the in-memory list until it is empty.

    <range> is the first capture group of seq_file_pat applied to seq_file.
    Blank lines in the input are ignored; an empty input produces an empty
    output file. The output file is always closed, even if an error occurs.
    """
    # Create the ordered file name from the sequences file name.
    seq_range = seq_file_pat.search(seq_file).group(1)
    ordered_dir = os.path.dirname(os.path.abspath(seq_file))
    ordered_file = os.path.join(ordered_dir, 'ordered_{}.csv'.format(seq_range))

    max_pids = 100  # PIDs range from xxx00 to xxx99 in each sequences file
    sid_list = [-1 for i in range(max_pids)]  # Last SID written per (PID mod max_pids)

    # Read the input before touching the output so that a read failure does
    # not leave a truncated ordered file behind. Skip blank lines, which
    # get_pid_sid() cannot parse.
    seq_list = [line for line in load_file(seq_file) if len(line.strip()) > 0]

    ofd = fu.open_file(ordered_file, 'w')
    try:
        if (len(seq_list) == 0):
            return  # Nothing to order

        # Get the first PID and SID.
        index = 0
        pid, sid = get_pid_sid(seq_list[index])
        mod_pid = pid % max_pids
        sid_list[mod_pid] = sid

        # Remove the first record from the list and write it to the file.
        rec = seq_list.pop(index)
        ofd.write(rec)

        # Loop until the list is empty.
        while (len(seq_list) > 0):
            # Get the next PID.
            pid, sid, index = next_pid_in_list(seq_list, index, pid)
            mod_pid = pid % max_pids

            # Try to get a new SID for this PID.
            if (sid <= sid_list[mod_pid]):
                pid, sid, index = next_sid_in_list(seq_list, index, pid, sid_list[mod_pid])
            sid_list[mod_pid] = sid

            # Remove the record from the list and write it to the file.
            rec = seq_list.pop(index)
            ofd.write(rec)

            # This will happen if we pop the record at the very end of the list.
            if (index >= len(seq_list)):
                index = len(seq_list) - 1
    finally:
        fu.close_file(ofd)

#### Run this cell to create the ordered files for each sequence  
Note that this can take a very long time to run (10+ minutes each for large files).

In [8]:
start_index = 0  # Zero-based starting index into seq_file_list
# num_files = len(seq_file_list)
num_files = 1
# Clamp the end of the range so that a non-zero start_index combined with
# num_files = len(seq_file_list) cannot index past the end of the list.
stop_index = min(start_index + num_files, len(seq_file_list))
i = 0
for i in range(start_index, stop_index):
    seq_file = seq_file_list[i]
    print('{:3d}: {} {}'.format(i+1, ic.timestamp(), seq_file))
    order_file(seq_file)
print('{:3d}: {} Done.'.format(i+1, ic.timestamp()))

  1: 2023-06-16 08:27:22 D:\dev\jupyter\Icentia11k\data\p00\sequences_p00000_p00099_AFIB.csv
  1: 2023-06-16 08:27:54 Done.


#### Check the files created above

In [6]:
# Patterns that classify an ordered file by the rhythm type in its name.
ordered_afl_file_pat = re.compile(r'ordered_p\d{5}_p\d{5}_AFL\.csv')
ordered_afib_file_pat = re.compile(r'ordered_p\d{5}_p\d{5}_AFIB\.csv')
ordered_n_file_pat = re.compile(r'ordered_p\d{5}_p\d{5}_N\.csv')
ordered_q_file_pat = re.compile(r'ordered_p\d{5}_p\d{5}_Q\.csv')

afib_file_count = 0
afl_file_count = 0
n_file_count = 0
q_file_count = 0

afib_line_count = 0
afl_line_count = 0
n_line_count = 0
q_line_count = 0

# Count the non-blank lines in one file.
def count_nonblank_lines(file_name):
    """Return the number of lines in file_name that are not empty or whitespace-only."""
    count = 0
    with fu.open_file(file_name) as sfp:
        for line in sfp:
            if (len(line.strip()) > 0):
                count += 1
    fu.close_file(sfp)
    return count

for sd in ic.SUBDIRS:
    # glob.glob() returns each match with the directory prefix it was given,
    # so the results can be opened directly; no variable from another cell
    # is needed to rebuild the path.
    files = glob.glob(os.path.join(ic.LOCAL_DATA_PATH, sd, r'ordered_p*.csv'))
    for file in files:

        if ordered_afl_file_pat.search(file):
            afl_file_count += 1
            afl_line_count += count_nonblank_lines(file)

        elif ordered_afib_file_pat.search(file):
            afib_file_count += 1
            afib_line_count += count_nonblank_lines(file)

        elif ordered_n_file_pat.search(file):
            n_file_count += 1
            n_line_count += count_nonblank_lines(file)

        elif ordered_q_file_pat.search(file):
            q_file_count += 1
            q_line_count += count_nonblank_lines(file)
print('Files: AFL: {}, AFIB: {}, N: {}, Q: {}'.format(afl_file_count, afib_file_count, n_file_count, q_file_count))
print('Lines: AFL: {}, AFIB: {}, N: {}, Q: {}'.format(afl_line_count, afib_line_count, n_line_count, q_line_count))

Files: AFL: 109, AFIB: 110, N: 110, Q: 110
Lines: AFL: 825775, AFIB: 2424476, N: 47187038, Q: 8208569


### 3. Consolidate the individual ordered files into one file per rhythm type per subdirectory  
Since AFL has the fewest sequences, we will construct it first and then build the others to match its size.  
#### Functions

In [5]:
# Function to get a list of sequences from an open ordered rhythm sequence file.
# Returns a list of sequences with strictly increasing PIDs (so each PID is unique).
# Returns an empty list when the file is exhausted.
# Assumes that the PIDs are sorted in increasing order.
def get_next_sequences_from_file(fd):
    """Read the next run of records with strictly increasing PIDs from fd.

    Reading stops at a blank line or end of file, or at the first record
    whose PID is not greater than the previous one; in the latter case the
    file position is rewound so that record starts the next call's run.

    fd must support readline(), tell() and seek().
    """
    batch = []
    prev_pid = 0

    # Get first sequence.
    line = fd.readline()
    if line.strip():
        prev_pid, _ = get_pid_sid(line)
        batch.append(line)

    # Get remaining sequences.
    while True:
        mark = fd.tell()  # Remember where this record starts in case we must rewind
        line = fd.readline()
        if not line.strip():
            break  # End of file
        pid, _ = get_pid_sid(line)
        if pid <= prev_pid:
            # PID did not increase: leave this record for the next call.
            fd.seek(mark)
            break
        batch.append(line)
        prev_pid = pid
    return batch

# Function to write a list of sequences to the ordered sequence file.
# Stop when the maximum number is reached (if specified).
# Returns the number of sequences written to the file.
def write_sequences_to_file(fd, seq_list, seq_max=-1):
    """Write items of seq_list to fd, at most seq_max of them when seq_max >= 0.

    A negative seq_max means no limit. Any exception raised while writing is
    reported with print() and not re-raised; the count written so far is
    still returned.
    """
    written = 0
    try:
        for seq in seq_list:
            # Chained comparison: a limit is in effect and has been reached.
            if 0 <= seq_max <= written:
                break
            fd.write(seq)
            written += 1
    except Exception as err:
        print('File write error: {}'.format(str(err)))
    return written

In [6]:
# Function to create one consolidated ordered file in the subdirectory for the rhythm type.
# Ends when the number of sequences meets or exceeds seq_limit, or when all input files are exhausted.
# Returns the number of sequences in the file.
def create_ordered_subdir_file(sd, rtype, seq_limit=-1):
    """Merge the per-range ordered files of one rhythm type in subdirectory sd.

    The input files are visited round-robin; on each visit one run of
    increasing-PID sequences is read from the file and appended to the
    consolidated output file.

    sd        -- subdirectory name under ic.LOCAL_DATA_PATH
    rtype     -- rhythm type; upper-cased for matching input file names,
                 used as given in the output file name
    seq_limit -- maximum number of sequences to write; negative means no limit

    Errors raised while creating or filling the output file are printed and
    not re-raised. Errors raised while listing or opening the input files
    propagate to the caller. All opened files are closed in either case.
    """
    seq_count = 0
    ordered_fd_list = []  # Grows as needed; no fixed cap on the number of input files
    osrfd = None          # Stays None if the output file could not be opened
    ordered_file_pat = re.compile(r'ordered_p\d{5}_p\d{5}_' + rtype.upper() + r'\.csv')

    subdir_path = os.path.join(ic.LOCAL_DATA_PATH, sd)

    try:
        # Open the individual ordered rhythm files created in Step 2.
        for file in os.listdir(subdir_path):
            if ordered_file_pat.match(file):
                ordered_fd_list.append(fu.open_file(os.path.join(subdir_path, file)))

        try:
            # Create the master ordered sequence file in this subdirectory for this rhythm type.
            ordered_subdir_rtype_file_name = '{}_{}_{}.csv'.format(ic.ORDERED_SUBDIR_BN, sd, rtype)
            ordered_subdir_rtype_file_path = os.path.join(subdir_path, ordered_subdir_rtype_file_name)
            print('Creating {}'.format(ordered_subdir_rtype_file_path))
            osrfd = fu.open_file(ordered_subdir_rtype_file_path, 'w')

            # Continuously loop through the individual ordered rhythm files until the end is reached.
            # Get a set of sequences from each and write them to the master ordered sequence file.
            seq_remain = True  # Allow loop to run at least once
            while seq_remain:
                seq_remain = False
                for fd in ordered_fd_list:
                    seq_list = get_next_sequences_from_file(fd)  # Get list of sequences
                    if (len(seq_list) > 0):
                        # We have more sequences to write to the file.
                        # See if we have a sequence limit.
                        seq_max = -1
                        if (seq_limit >= 0):
                            seq_max = max(seq_limit - seq_count, 0)
                        seq_count += write_sequences_to_file(osrfd, seq_list, seq_max)  # Write them to the file
                        if (seq_limit >= 0):
                            if (seq_count < seq_limit):
                                seq_remain = True  # We did not hit our sequence limit
                        else:
                            seq_remain = True  # No sequence limit in effect
        except Exception as err:
            print(str(err))
    finally:
        # Close the master ordered sequence file.
        if osrfd is not None:
            fu.close_file(osrfd)

        # Close the individual ordered rhythm files created in Step 2.
        for fd in ordered_fd_list:
            fu.close_file(fd)
    return seq_count

#### Construct one consolidated ordered rhythm type file per subdirectory  

In [7]:
# Build the four consolidated files for every subdirectory in ic.SUBDIRS.
for sd in ic.SUBDIRS:
    print('Subdir: {}'.format(sd))

    # Construct the AFL file first and use its size to limit the others.
    rtype = 'AFL'
    afl_seq_count = create_ordered_subdir_file(sd, rtype)
    print('{} seq count: {}'.format(rtype, afl_seq_count))

    # Construct the remaining rhythm files.
    # Each is capped at the number of AFL sequences written for this subdirectory.
    for rtype in ['AFIB', 'N', 'Q']:
        seq_count= create_ordered_subdir_file(sd, rtype, seq_limit=afl_seq_count)
        print('{} seq count: {}'.format(rtype, seq_count))

Subdir: p00
Creating E:/Data/Icentia11k/data\p00\ordered_dir_p00_AFL.csv
AFL seq count: 56527
Creating E:/Data/Icentia11k/data\p00\ordered_dir_p00_AFIB.csv
AFIB seq count: 56527
Creating E:/Data/Icentia11k/data\p00\ordered_dir_p00_N.csv
N seq count: 56527
Creating E:/Data/Icentia11k/data\p00\ordered_dir_p00_Q.csv
Q seq count: 56527
Subdir: p01
Creating E:/Data/Icentia11k/data\p01\ordered_dir_p01_AFL.csv
AFL seq count: 72977
Creating E:/Data/Icentia11k/data\p01\ordered_dir_p01_AFIB.csv
AFIB seq count: 72977
Creating E:/Data/Icentia11k/data\p01\ordered_dir_p01_N.csv
N seq count: 72977
Creating E:/Data/Icentia11k/data\p01\ordered_dir_p01_Q.csv
Q seq count: 72977
Subdir: p02
Creating E:/Data/Icentia11k/data\p02\ordered_dir_p02_AFL.csv
AFL seq count: 76270
Creating E:/Data/Icentia11k/data\p02\ordered_dir_p02_AFIB.csv
AFIB seq count: 76270
Creating E:/Data/Icentia11k/data\p02\ordered_dir_p02_N.csv
N seq count: 76270
Creating E:/Data/Icentia11k/data\p02\ordered_dir_p02_Q.csv
Q seq count: 7627

### 4. Consolidate subdirectory files into one master ordered list per rhythm type  
#### Functions

In [8]:
def create_ordered_master_file(rtype):
    """Merge the per-subdirectory ordered files of one rhythm type into a master file.

    The subdirectory files (one per entry of ic.SUBDIRS) are visited
    round-robin; on each visit one run of increasing-PID sequences is read
    from the file and appended to the master file in ic.LOCAL_DATA_PATH.

    rtype -- rhythm type; upper-cased for the input file names, used as given
             in the master file name

    Returns the number of sequences written. Errors raised while merging are
    printed and not re-raised; errors raised while opening any file propagate
    to the caller. All opened files are closed in either case.
    """
    ordered_subdir_fd_list = []
    omfd = None  # Stays None if the master file could not be opened
    seq_count = 0
    ordered_master_file = '{}_{}.csv'.format(ic.ORDERED_MASTER_BN, rtype)

    try:
        # Open the subdir files for this rhythm type created in Step 3.
        for sd in ic.SUBDIRS:
            filename = '{}_{}_{}.csv'.format(ic.ORDERED_SUBDIR_BN, sd, rtype.upper())
            ordered_subdir_file = os.path.join(ic.LOCAL_DATA_PATH, sd, filename)
            ordered_subdir_fd_list.append(fu.open_file(ordered_subdir_file))

        # Create the master file for this rhythm type.
        print('Creating {}'.format(ordered_master_file))
        omfd = fu.open_file(os.path.join(ic.LOCAL_DATA_PATH, ordered_master_file), 'w')

        try:
            # Continuously loop through the individual ordered rhythm files until the end is reached.
            # Get a set of sequences from each and write them to the master ordered sequence file.
            seq_remain = True  # Allow loop to run at least once
            while seq_remain:
                seq_remain = False
                for fd in ordered_subdir_fd_list:
                    seq_list = get_next_sequences_from_file(fd)  # Get list of sequences
                    if (len(seq_list) > 0):
                        # We have more sequences to write to the file.
                        seq_count += write_sequences_to_file(omfd, seq_list)
                        seq_remain = True
        except Exception as err:
            print(str(err))
    finally:
        # Close the master file.
        if omfd is not None:
            fu.close_file(omfd)

        # Close the subdir files for this rhythm type created in Step 3.
        for fd in ordered_subdir_fd_list:
            fu.close_file(fd)
    return seq_count

In [9]:
# Build one master ordered file per rhythm type and report how many sequences each holds.
for rtype in ic.RTYPES:
    seq_count = create_ordered_master_file(rtype)
    print('{} count: {}'.format(rtype, seq_count))

Creating ordered_master_AFIB.csv
AFIB count: 825775
Creating ordered_master_AFL.csv
AFL count: 825775
Creating ordered_master_N.csv
N count: 825775
Creating ordered_master_Q.csv
Q count: 825775


### 5. Analyze the master ordered lists

In [10]:
# Function to return a histogram of PIDs.
def get_pid_histogram(file, seq_max=0):
    """Count records per PID in file and return the counts as an int32 array.

    The array has ic.NUM_PATIENTS entries and is indexed by PID. The PID is
    taken from characters 1..5 of each non-blank line. When seq_max > 0,
    reading stops after seq_max non-blank lines; otherwise the whole file
    is read.
    """
    pid_histo = np.zeros(ic.NUM_PATIENTS, dtype=np.int32)
    seen = 0
    with fu.open_file(file) as fd:
        for line in fd:
            if not line.strip():
                continue  # Ignore blank lines
            pid_histo[int(line[1:6])] += 1
            if seq_max > 0:
                seen += 1
                if seen >= seq_max:
                    break
    # Explicit close kept from the original code; presumably redundant after
    # the with-block -- TODO confirm against fileutils.
    fu.close_file(fd)
    return pid_histo

def analyze_file(rtype, seq_max=0):
    """Print PID statistics for the master ordered file of one rhythm type.

    seq_max is passed through to get_pid_histogram() to limit how many
    records are read (0 means all). Returns the first value reported by
    ic.get_histo_stats(), which is printed as the sequence count.
    """
    master_spec = os.path.join(
        ic.LOCAL_DATA_PATH,
        '{}_{}.csv'.format(ic.ORDERED_MASTER_BN, rtype))
    stats = ic.get_histo_stats(get_pid_histogram(master_spec, seq_max))
    histo_sum, histo_cnt, histo_min, histo_avg, histo_max = stats
    print('{:>4s}: Seq: {:6d} PIDs: {:5d}  min: {:3d}  avg: {:6.1f}  max: {:4d}'.format(
        rtype, histo_sum, histo_cnt, histo_min, histo_avg, histo_max))
    return histo_sum

In [11]:
# First pass: statistics over each complete master file.
# After the loop, seq holds the sequence count returned for the last rhythm type.
for rtype in ic.RTYPES:
    seq = analyze_file(rtype)
    
# Repeat the analysis on successively halved prefixes of the master files
# until the prefix length is 20000 sequences or fewer.
while (seq > 20000):
    seq = seq // 2
    print()
    for rtype in ic.RTYPES:
        # seq is both the read limit passed in and the count returned, so a
        # shorter file would lower the limit for the rhythm types after it.
        seq = analyze_file(rtype, seq)

AFIB: Seq: 825775 PIDs:   727  min:   1  avg: 1135.9  max: 2186
 AFL: Seq: 825775 PIDs:   494  min:   1  avg: 1671.6  max: 6738
   N: Seq: 825775 PIDs: 10258  min:   2  avg:   80.5  max:  120
   Q: Seq: 825775 PIDs: 10849  min:   1  avg:   76.1  max:  125

AFIB: Seq: 412887 PIDs:   727  min:   1  avg:  567.9  max:  689
 AFL: Seq: 412887 PIDs:   494  min:   1  avg:  835.8  max: 1700
   N: Seq: 412887 PIDs: 10258  min:   2  avg:   40.3  max:   41
   Q: Seq: 412887 PIDs: 10849  min:   1  avg:   38.1  max:   40

AFIB: Seq: 206443 PIDs:   727  min:   1  avg:  284.0  max:  326
 AFL: Seq: 206443 PIDs:   493  min:   1  avg:  418.7  max:  675
   N: Seq: 206443 PIDs: 10258  min:   2  avg:   20.1  max:   21
   Q: Seq: 206443 PIDs: 10849  min:   1  avg:   19.0  max:   20

AFIB: Seq: 103221 PIDs:   727  min:   1  avg:  142.0  max:  156
 AFL: Seq: 103221 PIDs:   493  min:   1  avg:  209.4  max:  288
   N: Seq: 103221 PIDs: 10258  min:   2  avg:   10.1  max:   11
   Q: Seq: 103221 PIDs: 10849  min:  

### 6. Compare to the original rhythm files

In [15]:
# Function to compute the Patient ID stats for all original rhythm files.
def get_rhythm_stats(desired_rtype):
    """Return PID histogram statistics for one rhythm type over all rhythm files.

    Every 'rhythms_p*.csv' file in each ic.SUBDIRS subdirectory whose path
    matches ic.RHYTHMS_FILE_RE is scanned. For each non-blank line, the
    first field (minus its first character) is the PID and the third field
    is the rhythm type; lines whose rhythm type equals desired_rtype are
    counted per PID.

    Returns the 5-tuple (sum, count, min, avg, max) from ic.get_histo_stats().
    """
    # NOTE(review): hard-coded here while get_pid_histogram() sizes its array
    # with ic.NUM_PATIENTS -- confirm whether the two are meant to agree.
    num_patients = 11000
    rhythms_file_pat = re.compile(ic.RHYTHMS_FILE_RE)
    pid_histo = np.zeros(num_patients, dtype=np.int32)

    for sd in ic.SUBDIRS:
        files = glob.glob(os.path.join(ic.LOCAL_DATA_PATH, sd, r'rhythms_p*.csv'))
        for file in files:
            if rhythms_file_pat.search(file):
                with fu.open_file(file) as fd:
                    for line in fd:
                        stripped = line.strip()
                        if (len(stripped) == 0):
                            # Skip blank lines, as the other readers in this
                            # notebook do, instead of failing in int('').
                            continue
                        parsed_line = stripped.split(',')
                        pid = int(parsed_line[0][1:])
                        rtype = parsed_line[2]
                        if (rtype == desired_rtype):
                            pid_histo[pid] += 1
                fu.close_file(fd)
    histo_sum, histo_cnt, histo_min, histo_avg, histo_max = ic.get_histo_stats(pid_histo)
    return (histo_sum, histo_cnt, histo_min, histo_avg, histo_max)

In [16]:
# Print the per-PID statistics of the original rhythm files for each rhythm type.
for rtype in ic.RTYPES:
    (histo_sum, histo_cnt, histo_min, histo_avg, histo_max) = get_rhythm_stats(rtype)
    print('{:>4s}: Seq: {:8d} PIDs: {:5d}  min: {:3d}  avg: {:6.1f}  max: {:4d}'.format(
        rtype, histo_sum, histo_cnt, histo_min, histo_avg, histo_max))  

AFIB: Seq:   342816 PIDs:   728  min:   1  avg:  470.9  max: 1353
 AFL: Seq:   120621 PIDs:   494  min:   1  avg:  244.2  max: 1300
   N: Seq:  6689763 PIDs: 10259  min:   1  avg:  652.1  max: 1318
   Q: Seq:  2714781 PIDs: 10850  min:   1  avg:  250.2  max: 1158
