In [1]:
import sys
sys.path.append('C:/Users/lumin/Desktop/Work/20212/source-code/python-classifier-2022')

from helper_code import *
from os.path import dirname, basename, join

%load_ext autoreload
%autoreload 2

In [15]:

from tqdm.notebook import tqdm
from scipy.io import savemat

def create_annotation_for_segmentation(rec_file_name):
    """
    Generate annotation file to serve as input for logistic regression - HSMM segmentation
    
    Arguments:
    - rec_file_name: the path to the wav file
    Returns:
    - A 2-D array: 
        + first column contains the indices of R peak (start of S1)
        + second column contains the indices of the end of T wave (end of S2)
    """

    data_folder = dirname(rec_file_name)
    recording_name = basename(rec_file_name).replace('.wav', '')
    seg_filename = join(data_folder, recording_name + '.tsv')

    R_indices = []
    T_indices = []

    with open(seg_filename, 'r') as f:
        data = f.read()
        for l in data.split('\n'):
            sound_type = l.split('\t')[-1]
            if sound_type == '1':
                R_indices.append(round(float(l.split('\t')[0]) * NEW_SAMPLING_RATE, 0))
            elif sound_type == '3':
                T_indices.append(round(float(l.split('\t')[0]) * NEW_SAMPLING_RATE, 0))

    if len(R_indices) != 0:
        R_indices = np.vstack(R_indices)
    if len(T_indices) != 0:
        T_indices = np.vstack(T_indices)
        
    # return np.ndarray([R_indices, T_indices], dtype='object')
    return [R_indices, T_indices]


# print(create_annotation_for_segmentation("C:/Users/lumin/Desktop/Work/20212/Data/circor-heart-sound/final/train/2530_AV.wav"))

def build_mat_data(folder_path):
    patient_files = find_patient_files(folder_path)

    # audio_recordings = np.ndarray([], dtype='object')
    # annotations = np.ndarray([], dtype='object')
    audio_recordings = list()
    annotations = list()

    for current_patient in tqdm(patient_files, desc='Patients', position=0):
        current_patient_data = load_patient_data(current_patient)
        rec_files, current_recordings = load_recordings(folder_path, current_patient_data, get_name=True)

        for i in range(len(current_recordings)):
            audio_recordings.append(np.vstack(current_recordings[i]))
            # np.append(audio_recordings, np.vstack(current_recordings[j]))
            
            current_annotation = create_annotation_for_segmentation(rec_files[i])
            annotations.append(current_annotation)
            # np.append(annotations, current_annotation)
            # annotations = np.vstack((annotations, current_annotation))

    data_type = basename(folder_path)
    output_mat_file = join(output_mat_folder, f'{data_type}.mat')

    print(output_mat_file)

    savemat(output_mat_file, {'PCGCellArray' : audio_recordings, 
                                'annotationsArray' : annotations,
                                'Fs' : float(NEW_SAMPLING_RATE)})

In [16]:
DATA_PATH = 'C:/Users/lumin/Desktop/Work/20212/Data/circor-heart-sound'
output_mat_folder = "C:/Users/lumin/Desktop/Work/20212/source-code/physionet.org/files/hss/1.0"

train_data_folder = join(DATA_PATH, 'final', 'train')
test_data_folder = join(DATA_PATH, 'final', 'test')

In [18]:
build_mat_data(train_data_folder)

print("foo")

Patients:   0%|          | 0/743 [00:00<?, ?it/s]

C:/Users/lumin/Desktop/Work/20212/source-code/physionet.org/files/hss/1.0\train.mat
foo


In [17]:
build_mat_data(test_data_folder)

Patients:   0%|          | 0/199 [00:00<?, ?it/s]

C:/Users/lumin/Desktop/Work/20212/source-code/physionet.org/files/hss/1.0\test.mat


  narr = np.asanyarray(source)
