In [2]:
import numpy as np
import pandas as pd
from scipy.io import loadmat
from collections import Counter
import matplotlib.pyplot as plt
import os
import wfdb
from sklearn.model_selection import train_test_split

In [1]:
import random

In [3]:
ecg_data = os.path.join(os.getcwd(), 'processed_ecg_signals_2/WFDBRecords')




hea_files = []
mat_files = []

for root, dirs, files in os.walk(ecg_data):
    for file in files:
        if file.endswith('.hea'):
            hea_files.append(os.path.join(root, file))
        elif file.endswith('.mat'):
            mat_files.append(os.path.join(root, file))


print('\nTotal HEA Files:', len(hea_files))
print('Total MAT Files:', len(mat_files))


Total HEA Files: 45152
Total MAT Files: 45152


In [4]:
records_list = []
conditions_list = []

mapping_file = os.path.join(os.getcwd(), 'processed_ecg_signals_2/ConditionNames_SNOMED-CT.csv')
mapping_df = pd.read_csv(mapping_file)
mapping_dict = dict(zip(mapping_df['Snomed_CT'], mapping_df['Full Name']))



for hea_file, mat_file in zip(hea_files, mat_files):
    base_name_hea = os.path.splitext(os.path.basename(hea_file))[0]
    
    header_path = os.path.join(os.path.dirname(hea_file), base_name_hea)

    base_name_mat = os.path.splitext(os.path.basename(mat_file))[0]
    
    mat_path = os.path.join(os.path.dirname(mat_file), base_name_mat)
    
    try:
        record = wfdb.rdheader(header_path)

        mat_data = loadmat(mat_path)

        data_matrix = mat_data['my_matrix']




        #extracting comments
        comments = record.comments[2].split(':')
        b = comments[1]
        b = b.split(",")
        diagnosis_code = []
        for x in b:
            diagnosis_code.append(int(x))

        diagnosis_descriptions = [mapping_dict.get(x, 'Normal') for x in diagnosis_code]

        
        records_list.append((record, data_matrix))
        conditions_list.append(diagnosis_descriptions)
       
    except FileNotFoundError:
        print(f'File not found: {hea_file}')
    except ValueError as e:
        print(f"Error downloading {record}: {e}")
    except KeyError as k:
        print(f"Key error: {record}: {k}")


print(f"Total records: {len(records_list)}")
print(f"Total conditions: {len(conditions_list)}")

print("\nSample record and its conditions:")
print(records_list[0])  
print(conditions_list[0])  

Error downloading <wfdb.io.record.Record object at 0x16c54ef10>: time data '/' does not match format '%d/%m/%Y'
Key error: <wfdb.io.record.Record object at 0x16c5c08d0>: 'my_matrix'
Key error: <wfdb.io.record.Record object at 0x16e79a4d0>: 'my_matrix'
Total records: 45149
Total conditions: 45149

Sample record and its conditions:
(<wfdb.io.record.Record object at 0x1539c11d0>, array([[ 0.01649026,  0.01438088,  0.01483756, ...,  0.0456498 ,
         0.04935673,  0.04861371],
       [ 0.07197136,  0.067756  ,  0.07067435, ..., -0.00121251,
         0.00295649,  0.00157854],
       [ 0.05549295,  0.05332219,  0.05564157, ..., -0.04664117,
        -0.04634532, -0.04700355],
       ...,
       [ 0.15332092,  0.16640412,  0.19245953, ..., -0.05005094,
        -0.05204563, -0.0584688 ],
       [ 0.14082128,  0.14270729,  0.1571812 , ..., -0.04425058,
        -0.04579298, -0.05182821],
       [ 0.04846209,  0.03823981,  0.03815351, ..., -0.01340621,
        -0.01379788, -0.01834516]]))
['Atri

In [13]:
filtered_conditions = []

for conditions in conditions_list:
    if len(conditions) > 1:
        selected_condition = random.choice(conditions)
    else:
        selected_condition = conditions[0]
    filtered_conditions.append(selected_condition)

condition_counts = Counter(filtered_conditions)
print(condition_counts)

conditions_to_remove = {condition for condition, count in condition_counts.items() if count == 1}

filtered_conditions_final = []
filtered_records = []


for record, condition in zip(records_list, filtered_conditions):
    if condition not in conditions_to_remove:
        filtered_records.append(record)
        filtered_conditions_final.append(condition)

condition_counts_2 = Counter(filtered_conditions_final)

print(condition_counts_2)


train_records, test_records, train_conditions, test_conditions = train_test_split(
    filtered_records, 
    filtered_conditions_final, 
    test_size=0.2, 
    stratify=filtered_conditions_final, 
    random_state=42
)


Counter({'Sinus Bradycardia': 12119, 'Sinus Rhythm': 6837, 'Normal': 5999, 'Sinus Tachycardia': 4809, 'Atrial Flutter': 3977, 'T wave Change': 2467, 'Sinus Irregularity': 1769, 'Atrial Fibrillation': 966, 'T wave opposite': 702, 'Supraventricular Tachycardia': 539, 'Axis left shift': 492, 'atrial\xa0premature\xa0beats': 416, 'ST-T Change': 404, 'lower voltage QRS in all lead': 388, '1 degree atrioventricular block': 388, 'ST drop down': 388, 'Axis right shift': 297, 'abnormal Q wave': 280, 'right bundle branch block': 236, 'Intraventricular block': 235, 'ST extension': 215, 'left ventricle hypertrophy': 163, 'Early repolarization of the ventricles': 130, 'Atrial Tachycardia': 116, 'QT interval extension': 100, 'ventricular premature beat': 91, 'atrioventricular block': 91, 'left front bundle branch block': 80, 'countercolockwise rotation': 55, 'ST tilt up': 55, 'P wave Change': 38, 'U wave': 37, 'WPW': 36, 'Myocardial infraction in the side wall': 30, 'ventricular fusion wave': 29, 'ri

In [14]:
# Check the number of records in each set
print(f"Number of training records: {len(train_records)}")
print(f"Number of test records: {len(test_records)}")

# Check the distribution of conditions in training and test sets
train_condition_counts = Counter(train_conditions)
test_condition_counts = Counter(test_conditions)

print("Training conditions distribution:")
for condition, count in train_condition_counts.items():
    print(f"{condition}: {count}")

print("\nTest conditions distribution:")
for condition, count in test_condition_counts.items():
    print(f"{condition}: {count}")

# Optionally, you can also verify the overall distribution to check consistency
print("\nOverall filtered conditions distribution:")
filtered_conditions_final_counts = Counter(filtered_conditions_final)
for condition, count in filtered_conditions_final_counts.items():
    print(f"{condition}: {count}")

Number of training records: 36118
Number of test records: 9030
Training conditions distribution:
Atrial Flutter: 3181
Sinus Bradycardia: 9695
Sinus Tachycardia: 3847
Sinus Rhythm: 5469
Normal: 4799
atrioventricular block: 73
T wave Change: 1974
Sinus Irregularity: 1415
T wave opposite: 562
ventricular fusion wave: 23
Myocardial infraction in the side wall: 24
junctional escape beat: 14
Intraventricular block: 188
atrial premature beats: 333
Supraventricular Tachycardia: 431
2 degree atrioventricular block: 15
right bundle branch block: 189
lower voltage QRS in all lead: 310
1 degree atrioventricular block: 310
ST extension: 172
ST-T Change: 323
Atrial Fibrillation: 773
abnormal Q wave: 224
Axis right shift: 238
ST drop down: 310
ventricular premature beat: 73
Early repolarization of the ventricles: 104
P wave Change: 30
right atrial hypertrophy: 11
Axis left shift: 394
ST tilt up: 44
left front bundle branch block: 64
QT interval extension: 80
Atrial Tachycardia: 93
left ventricle hype

In [15]:
import scipy.io

In [29]:


for hea, mat in test_records:
    base_name = hea.record_name
    hea_file = os.path.join(base_name + '.hea')
    mat_file = os.path.join(base_name + '.mat')

    output_path = os.path.join('test_data', base_name + '.mat')  # Save in original directory
    scipy.io.savemat(output_path, {'my_matrix': mat})  # Save the modified file

    #scipy.io.savemat(mat_file, {'my_matrix': mat})


print("Export completed!")

Export completed!


In [28]:


for hea, mat in train_records:
    base_name = hea.record_name
    hea_file = os.path.join(base_name + '.hea')
    mat_file = os.path.join(base_name + '.mat')

    output_path = os.path.join('train_test_data', base_name + '.mat')  # Save in original directory
    scipy.io.savemat(output_path, {'my_matrix': mat})  # Save the modified file

    #scipy.io.savemat(mat_file, {'my_matrix': mat})


print("Export completed!")

Export completed!
