In [2]:
import pandas as pd
import numpy as np
import wfdb
import ast
import os
import pickle
from tqdm import tqdm

# Root folder of the PTB-XL download; ptbxl_database.csv and scp_statements.csv
# are read from here by a later cell.
# NOTE(review): absolute, machine-specific path — must be edited to run elsewhere.
raw_data_path = 'E:/pv/WORKING/ECG_main_folder/ECG_Classification_MI_detect/data/raw_data/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.3/'
# Output root for this notebook; created here if it does not exist yet.
save_path = 'E:/pv/WORKING/ECG_main_folder/ECG_Classification_MI_detect/data/loaded_data_MI_subclass/'
os.makedirs(save_path, exist_ok=True)
# 100 makes the filepath cell use the `filename_lr` column; any other value
# makes it use `filename_hr`.
sampling_rate = 100

# MI subclasses to classify (subclass code -> readable name)
mi_subclasses = {
    'IMI': 'Inferior MI',
    'AMI': 'Anterior MI',
    'LMI': 'Lateral MI',
    'PMI': 'Posterior MI'
}

# Create one sub-directory per MI subclass under save_path.
# Only the keys of mi_subclasses are iterated, so no NORM directory is created.
for subclass in mi_subclasses.keys():
    os.makedirs(os.path.join(save_path, subclass), exist_ok=True)

In [4]:
# Read the labels and map them to diagnostic subclasses
def aggregate_diagnostic(y_dict, agg_df, weight_threshold=80):
    """Map one record's SCP codes to its distinct diagnostic subclasses.

    Args:
        y_dict: Mapping of SCP code -> numeric value; a code is kept only when
            its value is >= ``weight_threshold``.
        agg_df: DataFrame indexed by SCP code that has a
            ``diagnostic_subclass`` column. Codes missing from the index are
            ignored.
        weight_threshold: Minimum value a code needs in ``y_dict`` to be kept.

    Returns:
        List of the distinct subclasses of the kept codes, ordered by first
        appearance in ``y_dict``. Empty when no code qualifies.
    """
    subclasses = [
        agg_df.loc[code, "diagnostic_subclass"]
        for code, weight in y_dict.items()
        if weight >= weight_threshold and code in agg_df.index
    ]
    # De-duplicate while keeping insertion order. list(set(...)) would return
    # string labels in an order that changes between interpreter runs (string
    # hashing is randomized), which makes the label lists non-reproducible.
    return list(dict.fromkeys(subclasses))

# Load the per-record metadata; scp_codes is stored as text and is parsed with
# ast.literal_eval into Python objects (consumed as dicts by aggregate_diagnostic)
Y = pd.read_csv(os.path.join(raw_data_path, 'ptbxl_database.csv'), index_col='ecg_id')
Y['scp_codes'] = Y['scp_codes'].apply(ast.literal_eval)

# Load the SCP statement table and keep only the rows flagged diagnostic == 1
agg_df = pd.read_csv(os.path.join(raw_data_path, 'scp_statements.csv'), index_col=0)
agg_df = agg_df[agg_df['diagnostic'] == 1]

# Attach the list of diagnostic subclasses to every record
Y['diagnostic_subclass'] = Y['scp_codes'].apply(lambda codes: aggregate_diagnostic(codes, agg_df))
Y

Unnamed: 0_level_0,patient_id,age,sex,height,weight,nurse,site,device,recording_date,report,...,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr,diagnostic_subclass
ecg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,15709.0,56.0,1,,63.0,2.0,0.0,CS-12 E,1984-11-09 09:17:34,sinusrhythmus periphere niederspannung,...,,", I-V1,",,,,,3,records100/00000/00001_lr,records500/00000/00001_hr,[NORM]
2,13243.0,19.0,0,,70.0,2.0,0.0,CS-12 E,1984-11-14 12:55:37,sinusbradykardie sonst normales ekg,...,,,,,,,2,records100/00000/00002_lr,records500/00000/00002_hr,[NORM]
3,20372.0,37.0,1,,69.0,2.0,0.0,CS-12 E,1984-11-15 12:49:10,sinusrhythmus normales ekg,...,,,,,,,5,records100/00000/00003_lr,records500/00000/00003_hr,[NORM]
4,17014.0,24.0,0,,82.0,2.0,0.0,CS-12 E,1984-11-15 13:44:57,sinusrhythmus normales ekg,...,", II,III,AVF",,,,,,3,records100/00000/00004_lr,records500/00000/00004_hr,[NORM]
5,17448.0,19.0,1,,70.0,2.0,0.0,CS-12 E,1984-11-17 10:43:15,sinusrhythmus normales ekg,...,", III,AVR,AVF",,,,,,4,records100/00000/00005_lr,records500/00000/00005_hr,[NORM]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21833,17180.0,67.0,1,,,1.0,2.0,AT-60 3,2001-05-31 09:14:35,ventrikulÄre extrasystole(n) sinustachykardie ...,...,,", alles,",,,1ES,,7,records100/21000/21833_lr,records500/21000/21833_hr,[STTC]
21834,20703.0,300.0,0,,,1.0,2.0,AT-60 3,2001-06-05 11:33:39,sinusrhythmus lagetyp normal qrs(t) abnorm ...,...,,,,,,,4,records100/21000/21834_lr,records500/21000/21834_hr,[NORM]
21835,19311.0,59.0,1,,,1.0,2.0,AT-60 3,2001-06-08 10:30:27,sinusrhythmus lagetyp normal t abnorm in anter...,...,,", I-AVR,",,,,,2,records100/21000/21835_lr,records500/21000/21835_hr,[]
21836,8873.0,64.0,1,,,1.0,2.0,AT-60 3,2001-06-09 18:21:49,supraventrikulÄre extrasystole(n) sinusrhythmu...,...,,,,,SVES,,8,records100/21000/21836_lr,records500/21000/21836_hr,[NORM]


In [6]:
# Drop records whose diagnostic_subclass list is empty
Y = Y.loc[Y['diagnostic_subclass'].map(len) > 0].copy()

# Detect records that mix NORM with a disease label
def is_mixed_with_norm(labels):
    """Return True when `labels` contains 'NORM' together with any other label."""
    if 'NORM' not in labels:
        return False
    # NORM is present: the record is mixed unless every label is NORM.
    return not all(label == 'NORM' for label in labels)

# Keep only the records that do not combine NORM with another subclass
Y = Y.loc[~Y['diagnostic_subclass'].map(is_mixed_with_norm)].copy()

# Reduce each record to a single MI target; records with no MI subclass
# (including NORM-only records) get None
def keep_only_mi(labels):
    """Return the first MI subclass present in `labels`, or None.

    Subclasses are checked in the fixed order IMI, AMI, LMI, PMI, so a record
    carrying several MI subclasses is assigned the earliest one in that order
    (for example ['AMI', 'IMI'] -> 'IMI'). None is returned when none of the
    four codes is present.
    """
    for mi_code in ('IMI', 'AMI', 'LMI', 'PMI'):
        if mi_code in labels:
            return mi_code
    return None

# Attach the single-label target, then drop every record that has none
Y = Y.assign(target=Y['diagnostic_subclass'].apply(keep_only_mi))
Y = Y.dropna(subset=['target']).copy()
Y

Unnamed: 0_level_0,patient_id,age,sex,height,weight,nurse,site,device,recording_date,report,...,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr,diagnostic_subclass,target
ecg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
177,21551.0,73.0,0,,,,3.0,AT-6 C,1986-03-15 08:11:15,sinusrytm extrem vÄnster el-axel avvikande qrs...,...,,,,,,4,records100/00000/00177_lr,records500/00000/00177_hr,[AMI],AMI
181,21551.0,73.0,0,,,,3.0,AT-6 C,1986-03-16 08:17:31,ventrikulÄr(a) extrasystoli(er) fÖrmaksflimmer...,...,,,,,,4,records100/00000/00181_lr,records500/00000/00181_hr,[AMI],AMI
184,13112.0,74.0,0,,,,3.0,AT-6 C,1986-03-17 07:33:28,sinusrytm extrem vÄnster el-axel avvikande qrs...,...,", I-AVR,",,,,,9,records100/00000/00184_lr,records500/00000/00184_hr,"[AMI, IMI]",IMI
189,13112.0,74.0,0,,,,3.0,AT-6 C,1986-03-21 08:54:03,slag med avvikande impulsutbredning supraventr...,...,", alles,",,,,,9,records100/00000/00189_lr,records500/00000/00189_hr,"[AMI, IMI]",IMI
210,16062.0,58.0,0,,78.0,2.0,0.0,CS-12 E,1986-05-16 06:46:36,sinusrhythmus ueberdrehter linkstyp,...,", alles,",,,,,10,records100/00000/00210_lr,records500/00000/00210_hr,[IMI],IMI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21805,16291.0,72.0,0,,,1.0,2.0,AT-60 3,2001-02-13 13:32:09,sinusrhythmus p-sinistrocardiale ueberdrehter ...,...,,,,,,3,records100/21000/21805_lr,records500/21000/21805_hr,"[AMI, LAFB/LPFB, ISCA]",AMI
21815,14433.0,82.0,1,,,1.0,2.0,AT-60 3,2001-02-25 10:58:58,sinusrhythmus a-v block i ueberdrehter linksty...,...,", alles,",,,,,3,records100/21000/21815_lr,records500/21000/21815_hr,"[_AVB, IMI, LAFB/LPFB]",IMI
21826,9178.0,82.0,1,,,1.0,2.0,AT-60 3,2001-05-13 18:20:52,sinus arrhythmie ueberdrehter linkstyp linksan...,...,", I-AVF,",,,,,10,records100/21000/21826_lr,records500/21000/21826_hr,[IMI],IMI
21827,13862.0,79.0,1,,,1.0,2.0,AT-60 3,2001-05-20 16:30:53,"ventrikulÄre extrasystole(n), trigeminus supra...",...,,,,"4ES,SVES",,5,records100/21000/21827_lr,records500/21000/21827_hr,[IMI],IMI


In [8]:
# Set filepath from the filename column that matches the configured sampling rate
Y['filepath'] = Y['filename_lr' if sampling_rate == 100 else 'filename_hr']
Y

Unnamed: 0_level_0,patient_id,age,sex,height,weight,nurse,site,device,recording_date,report,...,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr,diagnostic_subclass,target,filepath
ecg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
177,21551.0,73.0,0,,,,3.0,AT-6 C,1986-03-15 08:11:15,sinusrytm extrem vÄnster el-axel avvikande qrs...,...,,,,,4,records100/00000/00177_lr,records500/00000/00177_hr,[AMI],AMI,records100/00000/00177_lr
181,21551.0,73.0,0,,,,3.0,AT-6 C,1986-03-16 08:17:31,ventrikulÄr(a) extrasystoli(er) fÖrmaksflimmer...,...,,,,,4,records100/00000/00181_lr,records500/00000/00181_hr,[AMI],AMI,records100/00000/00181_lr
184,13112.0,74.0,0,,,,3.0,AT-6 C,1986-03-17 07:33:28,sinusrytm extrem vÄnster el-axel avvikande qrs...,...,,,,,9,records100/00000/00184_lr,records500/00000/00184_hr,"[AMI, IMI]",IMI,records100/00000/00184_lr
189,13112.0,74.0,0,,,,3.0,AT-6 C,1986-03-21 08:54:03,slag med avvikande impulsutbredning supraventr...,...,,,,,9,records100/00000/00189_lr,records500/00000/00189_hr,"[AMI, IMI]",IMI,records100/00000/00189_lr
210,16062.0,58.0,0,,78.0,2.0,0.0,CS-12 E,1986-05-16 06:46:36,sinusrhythmus ueberdrehter linkstyp,...,,,,,10,records100/00000/00210_lr,records500/00000/00210_hr,[IMI],IMI,records100/00000/00210_lr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21805,16291.0,72.0,0,,,1.0,2.0,AT-60 3,2001-02-13 13:32:09,sinusrhythmus p-sinistrocardiale ueberdrehter ...,...,,,,,3,records100/21000/21805_lr,records500/21000/21805_hr,"[AMI, LAFB/LPFB, ISCA]",AMI,records100/21000/21805_lr
21815,14433.0,82.0,1,,,1.0,2.0,AT-60 3,2001-02-25 10:58:58,sinusrhythmus a-v block i ueberdrehter linksty...,...,,,,,3,records100/21000/21815_lr,records500/21000/21815_hr,"[_AVB, IMI, LAFB/LPFB]",IMI,records100/21000/21815_lr
21826,9178.0,82.0,1,,,1.0,2.0,AT-60 3,2001-05-13 18:20:52,sinus arrhythmie ueberdrehter linkstyp linksan...,...,,,,,10,records100/21000/21826_lr,records500/21000/21826_hr,[IMI],IMI,records100/21000/21826_lr
21827,13862.0,79.0,1,,,1.0,2.0,AT-60 3,2001-05-20 16:30:53,"ventrikulÄre extrasystole(n), trigeminus supra...",...,,,"4ES,SVES",,5,records100/21000/21827_lr,records500/21000/21827_hr,[IMI],IMI,records100/21000/21827_lr


In [11]:
# Split the labelled records by target subclass and report each class size
Y_ami, Y_lmi, Y_pmi, Y_imi = (
    Y.loc[Y['target'].eq(code)] for code in ('AMI', 'LMI', 'PMI', 'IMI')
)
print(f"Number of AMI records: {len(Y_ami)}")
print(f"Number of LMI records: {len(Y_lmi)}")
print(f"Number of PMI records: {len(Y_pmi)}")
print(f"Number of IMI records: {len(Y_imi)}")

Number of AMI records: 1554
Number of LMI records: 21
Number of PMI records: 2
Number of IMI records: 1597
