# MIT-BIH-AFIB-02-sequences.ipynb
Notebook used to identify and parse waveforms from the MIT-BIH Atrial Fibrillation Database.  
Creates rhythm, sequence, and ordered-sequence CSV files for each rhythm type.  
See https://physionet.org/content/afdb/1.0.0/     

In [1]:
# Environment setup.
import os
import sys
import glob
import platform
import numpy as np

import fileutils as fu
import mit_bih_afib_db as db
import wfdb_utils as wfu

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Report the host platform and the interpreter/package versions in use, plus
# the current working directory, so a run of the notebook can be reproduced.
os_name = platform.system()
os_release = platform.release()
py_major, py_minor, py_micro = sys.version_info[:3]
print('OS name:            {} {}'.format(os_name, os_release))
print('Python version:     {}.{}.{}'.format(py_major, py_minor, py_micro))
print('Numpy version:      {}'.format(np.__version__))
print('You are here: {}'.format(os.getcwd()))

OS name:            Windows 10
Python version:     3.10.10
Numpy version:      1.23.2
You are here: D:\dev\jupyter\deep-cnn-embedded\src\mit-bih-afib


In [2]:
# Machine-specific locations; edit these before running on another host.
# LOCAL_DATA_PATH is the directory holding the database record files (it is
# joined with each patient ID when the records are read below).
db.LOCAL_DATA_PATH = r'E:\Data\MIT-BIH-AFIB\files'
# Presumably the directory where the db.get_*_file() helpers place the
# generated CSV files (the outputs printed below appear here) -- confirm in
# mit_bih_afib_db.
db.LOCAL_TFRECORD_PATH = r'E:\Data\MIT-BIH-AFIB\tfrecord'

In [3]:
# Function to get ECG header and annotation data.
def get_data(filename, start=0, length=None):
    """Read the annotation and header for one record.

    Args:
        filename: Record path handed unchanged to the wfdb_utils readers.
        start: Passed through to wfu.read_annotation (default 0).
        length: Passed through to wfu.read_annotation (default None).

    Returns:
        A 2-tuple (annotation, header) holding the results of
        wfu.read_annotation and wfu.read_header respectively.
    """
    # The annotation is read first, then the header; only the annotation
    # reader receives start/length.
    annotation = wfu.read_annotation(filename, start, length)
    header = wfu.read_header(filename)
    return annotation, header

In [4]:
# Function to count lines in a CSV file.
def line_count(file_name):
    """Count the non-blank lines in a text file.

    Lines that are empty or contain only whitespace are not counted.

    Args:
        file_name: Path of the file to read; opened with fu.open_file.

    Returns:
        The number of lines whose stripped text is non-empty.
    """
    fd = fu.open_file(file_name)
    try:
        return sum(1 for line in fd if line.strip())
    finally:
        # Always release the handle, even if reading the file raises.
        fu.close_file(fd)

### Create CSV files of each rhythm type  
CSV line format: `PID,rtype,start,length` (`start` and `length` are measured in samples)  

In [5]:
# Open one writable rhythm CSV file per rhythm type and keep the handles in a
# dict keyed by rhythm type so records can be routed to the right file.
rhythm_fd = {}
for rtype in db.RTYPES:
    rhythm_fd[rtype] = fu.open_file(db.get_rhythms_file(rtype), 'w')

In [7]:
# Walk every patient record in the database and append each rhythm found in
# it to the CSV file for that rhythm's type.
for pid in db.PATIENT_IDS:
    # Progress indicator: the PIDs are printed on a single line.
    print('{} '.format(pid), end='')
    ann, hdr = get_data(os.path.join(db.LOCAL_DATA_PATH, pid))
    for rhythm in wfu.parse_waveforms(ann, hdr):
        # rhythm[0] selects the output file; rhythm[1] and rhythm[2] fill the
        # remaining CSV fields (start and length per the CSV line format).
        rtype = rhythm[0]
        rhythm_fd[rtype].write(db.get_csv_record(pid, rtype, rhythm[1], rhythm[2]))
# Terminate the progress line.
print()

04015 04043 04048 04126 04746 04908 04936 05091 05121 05261 06426 06453 06995 07162 07859 07879 07910 08215 08219 08378 08405 08434 08455 


In [8]:
# Close every rhythm CSV file, one handle per rhythm type.
for rtype in db.RTYPES:
    handle = rhythm_fd[rtype]
    fu.close_file(handle)

### Split the rhythms into sequences of a specified length  

In [9]:
seq_length_sec = 30  # Sequence length in seconds
# Sequence length expressed in samples: seconds multiplied by db.FS_HZ
# (presumably the database sampling rate in Hz -- confirm in mit_bih_afib_db).
seq_samples = seq_length_sec * db.FS_HZ

In [10]:
# Function to split a rhythm record into a set of sequences of a defined length.
def get_sequences(record, seq_length):
    """Split one rhythm record into consecutive fixed-length sequences.

    Args:
        record: CSV text of the form 'PID,rtype,start,length', where start
            and length are integers.
        seq_length: Length of each output sequence, in the same units as the
            record's start and length fields.  Converted with int().

    Returns:
        A list of (pid, rtype, start, seq_length) tuples covering the record
        from its start in back-to-back steps of seq_length.  Any remainder
        shorter than seq_length is dropped, so a record shorter than
        seq_length (or with a non-positive length) yields an empty list.

    Raises:
        ValueError: If seq_length is not at least 1 after integer conversion,
            if the record does not have exactly four comma-separated fields,
            or if start/length are not integers.
    """
    pid, rtype, start, length = record.split(',')
    seq_length = int(seq_length)
    if seq_length <= 0:
        # A non-positive step would never consume the record.
        raise ValueError(
            'seq_length must be a positive integer, got {}'.format(seq_length))
    start = int(start)
    length = int(length)
    # Number of whole sequences that fit; range() of a negative count is empty.
    count = length // seq_length
    return [(pid, rtype, start + k * seq_length, seq_length)
            for k in range(count)]

In [14]:
# For each rhythm type, split every record of its rhythms CSV file into
# fixed-length sequences and write them to the matching sequences CSV file.
for rtype in db.RTYPES:
    # Create the sequence file.
    seq_file = db.get_sequences_file(rtype)
    seq_fd = fu.open_file(seq_file, 'w')
    try:
        # Open the corresponding rhythms file.  A distinct name is used here
        # so the notebook-level `rhythm_fd` dict of rhythm file handles is not
        # overwritten by a single handle.
        rhythm_file = db.get_rhythms_file(rtype)
        rhythm_in = fu.open_file(rhythm_file)
        try:
            # Split each non-blank line in the rhythms file into sequences.
            for line in rhythm_in:
                rhythm_record = line.strip()
                if not rhythm_record:
                    continue
                # Write the sequences to the CSV file.
                for seq_pid, seq_rtype, seq_start, seq_len in get_sequences(
                        rhythm_record, seq_samples):
                    seq_fd.write(
                        db.get_csv_record(seq_pid, seq_rtype, seq_start, seq_len))
        finally:
            # Release the input handle even if splitting or writing fails.
            fu.close_file(rhythm_in)
    finally:
        # Release the output handle even if the input could not be processed.
        fu.close_file(seq_fd)

### Check the results

In [16]:
# Print the number of non-blank records in each sequences file.
for rtype in db.RTYPES:
    seq_file = db.get_sequences_file(rtype)
    print('{}: {}'.format(seq_file, line_count(seq_file)))

E:\Data\MIT-BIH-AFIB\tfrecord\sequences_AFIB.csv: 11064
E:\Data\MIT-BIH-AFIB\tfrecord\sequences_AFL.csv: 190
E:\Data\MIT-BIH-AFIB\tfrecord\sequences_J.csv: 6
E:\Data\MIT-BIH-AFIB\tfrecord\sequences_N.csv: 16554
E:\Data\MIT-BIH-AFIB\tfrecord\sequences_Q.csv: 0


### Create ordered sequence CSV files

In [17]:
# Function to get the next PID record in the sequences list given the current PID.
def get_next_pid_record(seq_list, curr_pid):
    """Pick the next record to emit, advancing to a larger PID when possible.

    Args:
        seq_list: Non-empty list of CSV strings 'PID,rtype,start,length'.
        curr_pid: PID string of the record emitted last ('' to start).

    Returns:
        A tuple (index, pid, rtype, start, length) for the first record in
        seq_list whose PID compares greater than curr_pid (string
        comparison).  If no record has a greater PID, the first record in the
        list is returned with index 0.  All fields other than index are the
        raw strings from the record.

    Raises:
        IndexError: If seq_list is empty.
        ValueError: If an examined record does not have exactly four fields.
    """
    for index, record in enumerate(seq_list):
        pid, rtype, start, length = record.split(',')
        if pid > curr_pid:
            return (index, pid, rtype, start, length)
    # No remaining record has a larger PID: wrap around to the head of the list.
    pid, rtype, start, length = seq_list[0].split(',')
    return (0, pid, rtype, start, length)

In [19]:
# For each rhythm type, rewrite its sequences CSV file into an "ordered" CSV
# file whose records are emitted in the order chosen by get_next_pid_record:
# the first remaining record with a PID greater than the one just written,
# wrapping back to the head of the list when there is none.
for rtype in db.RTYPES:
    print(rtype)

    # Read the corresponding sequences file into memory, skipping blank
    # lines.  This is done before the output file is opened so an unreadable
    # input does not leave a freshly truncated output behind, and the input
    # handle is released as soon as it has been read.
    seq_file = db.get_sequences_file(rtype)
    seq_fd = fu.open_file(seq_file)
    try:
        seq_list = [text for text in (line.strip() for line in seq_fd) if text]
    finally:
        fu.close_file(seq_fd)

    # Create the ordered sequences file.
    ord_file = db.get_ordered_file(rtype)
    ord_fd = fu.open_file(ord_file, 'w')
    try:
        # Get successive PID records and write them to the ordered CSV file.
        pid = ''
        while seq_list:
            # The record's own rhythm type is kept in a separate name so the
            # loop variable `rtype` is not rebound inside the loop body.
            (i, pid, rec_rtype, start, length) = get_next_pid_record(seq_list, pid)
            ord_fd.write(db.get_csv_record(pid, rec_rtype, start, length))
            seq_list.pop(i)
    finally:
        # Release the output handle even if reordering or writing fails.
        fu.close_file(ord_fd)

AFIB
AFL
J
N
Q


### Check the results

In [20]:
# Print the number of non-blank records in each ordered sequences file; the
# counts should match those of the corresponding sequences files.
for rtype in db.RTYPES:
    ord_file = db.get_ordered_file(rtype)
    print('{}: {}'.format(ord_file, line_count(ord_file)))

E:\Data\MIT-BIH-AFIB\tfrecord\ordered_AFIB.csv: 11064
E:\Data\MIT-BIH-AFIB\tfrecord\ordered_AFL.csv: 190
E:\Data\MIT-BIH-AFIB\tfrecord\ordered_J.csv: 6
E:\Data\MIT-BIH-AFIB\tfrecord\ordered_N.csv: 16554
E:\Data\MIT-BIH-AFIB\tfrecord\ordered_Q.csv: 0
