In [1]:
import torch

# Sanity check: say whether PyTorch can see a CUDA device and, if it can,
# print the number of devices and the name of device 0.
if not torch.cuda.is_available():
    print("CUDA is not available")
else:
    print("CUDA is available")
    print(f"Device count: {torch.cuda.device_count()}")
    print(f"Device name: {torch.cuda.get_device_name(0)}")

CUDA is available
Device count: 8
Device name: Tesla V100-SXM2-32GB


In [2]:
import torch

# Report the installed PyTorch version and the value of torch.version.cuda.
print("PyTorch version:", torch.__version__)
print("CUDA version:", torch.version.cuda)

# List every visible CUDA device by index, or report that there is none.
if not torch.cuda.is_available():
    print("CUDA is not available")
else:
    print("CUDA is available")
    for device_index in range(torch.cuda.device_count()):
        print(f"Device {device_index}: {torch.cuda.get_device_name(device_index)}")

PyTorch version: 2.4.0
CUDA version: 12.1
CUDA is available
Device 0: Tesla V100-SXM2-32GB
Device 1: Tesla V100-SXM2-32GB
Device 2: Tesla V100-SXM2-32GB
Device 3: Tesla V100-SXM2-32GB
Device 4: Tesla V100-SXM2-32GB
Device 5: Tesla V100-SXM2-32GB
Device 6: Tesla V100-SXM2-32GB
Device 7: Tesla V100-SXM2-32GB


In [1]:
import os
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

In [18]:
# Path to the CSV file
csv_file_for_labels = '../../../data/padmalab_external/special_project/physionet.org/files/ptb-xl/1.0.3/ptbxl_train_label_df.csv'
# Path to the image directory
data_dir = '../../../data/padmalab_external/special_project/physionet.org/files/ptb-xl/1.0.3/records100_ground_truth'

# Fixed seed so the 80/20 split below is the same on every Restart & Run All.
SPLIT_SEED = 42

# Load the CSV file
label_df = pd.read_csv(csv_file_for_labels)
# A random 80% of the rows become the training frame; the remaining rows are held out.
# NOTE(review): the split is per row, not per patient_id, so recordings of one
# patient may land on both sides — confirm whether that is acceptable.
train_df = label_df.sample(frac=0.8, random_state=SPLIT_SEED)
test_df = label_df.drop(train_df.index)
label_df.head()

Unnamed: 0.2,Unnamed: 0.1,index,Unnamed: 0,ecg_id,patient_id,filename_lr,filename_hr,Normal_ECG,ecg_lr_path
0,0,108,108,109,21312.0,records100/00000/00109_lr,records500/00000/00109_hr,True,00109_lr
1,1,19314,19314,19353,19389.0,records100/19000/19353_lr,records500/19000/19353_hr,False,19353_lr
2,2,12707,12707,12739,16579.0,records100/12000/12739_lr,records500/12000/12739_hr,True,12739_lr
3,3,18414,18414,18453,21182.0,records100/18000/18453_lr,records500/18000/18453_hr,False,18453_lr
4,4,10879,10879,10906,14854.0,records100/10000/10906_lr,records500/10000/10906_hr,True,10906_lr


In [19]:
len(train_df)

12556

In [20]:
len(test_df)

3139

In [4]:
# Define transformations
# Edge length, in pixels, of the square image produced by Resize below.
img_size = 224  # or whatever size you want
# Per-channel mean/std normalisation applied after ToTensor.
# NOTE(review): these look like the usual ImageNet statistics — confirm they
# are appropriate for the ECG images.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# Pipeline applied to each image: resize -> tensor -> normalise.
transform = transforms.Compose([
    transforms.Resize((img_size, img_size)),
    transforms.ToTensor(),
    normalize,
])

In [5]:
import os
from torch.utils.data import Dataset, DataLoader
from PIL import Image

class ECGImageDataset(Dataset):
    """Dataset pairing rows of a label table with ECG image files on disk.

    Each row of ``label_df`` must provide ``ecg_lr_path`` (the image file stem)
    and ``Normal_ECG`` (the label). The image for a row is the ``.png`` file
    under ``image_dir`` named ``<ecg_lr_path>-0.png``.

    Args:
        label_df: pandas DataFrame with ``ecg_lr_path`` and ``Normal_ECG`` columns.
        image_dir: directory that is walked recursively for ``.png`` files.
        transform: optional callable applied to the loaded RGB image.
    """

    def __init__(self, label_df, image_dir, transform=None):
        self.label_df = label_df
        self.image_dir = image_dir
        self.transform = transform
        self.image_paths = self._get_image_paths()
        # File name -> first path carrying that name, so the common lookup is
        # a dict access instead of a scan over every path for every item.
        self._path_by_name = {}
        for path in self.image_paths:
            self._path_by_name.setdefault(os.path.basename(path), path)
        # Memo of already-resolved image names (including misses, stored as None).
        self._resolved = {}

    def _get_image_paths(self):
        """Return the paths of all ``.png`` files found under ``image_dir``."""
        image_paths = []
        for root, _, files in os.walk(self.image_dir):
            for file in files:
                if file.endswith('.png'):
                    image_paths.append(os.path.join(root, file))
        return image_paths

    def _find_image(self, img_name):
        """Return the path of the image called ``img_name``, or ``None``.

        An exact file-name match is preferred; otherwise the first path that
        contains ``img_name`` as a substring is used. Results are cached.
        """
        if img_name not in self._resolved:
            path = self._path_by_name.get(img_name)
            if path is None:
                path = next((p for p in self.image_paths if img_name in p), None)
            self._resolved[img_name] = path
        return self._resolved[img_name]

    def __len__(self):
        """Number of rows in the label table (not the number of images found)."""
        return len(self.label_df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for row ``idx``.

        If the row's image is missing, the following rows are tried in order
        (wrapping around to the start of the table) and the first row that has
        an image is returned instead, together with *that* row's label.

        Raises:
            IndexError: ``idx`` is outside the table.
            FileNotFoundError: no row of the table has an image on disk.
        """
        n_rows = len(self.label_df)
        if not -n_rows <= idx < n_rows:
            raise IndexError(f"index {idx} is out of range for a dataset of {n_rows} rows")

        start = idx % n_rows
        for offset in range(n_rows):
            # Wrap around instead of running past the last row.
            row = self.label_df.iloc[(start + offset) % n_rows]
            img_path = self._find_image(row['ecg_lr_path'] + '-0.png')
            if img_path is not None:
                break
        else:
            raise FileNotFoundError(
                f"No image for any row of the label table was found under {self.image_dir}."
            )

        # Close the file handle as soon as the pixels have been converted.
        with Image.open(img_path) as img:
            image = img.convert('RGB')
        label = row['Normal_ECG']

        if self.transform:
            image = self.transform(image)

        return image, label

# Training loader: samples are reshuffled on every pass.
train_dataset = ECGImageDataset(train_df, data_dir, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4, pin_memory=False)

# Evaluation loader: kept in table order so per-sample outputs are reproducible
# and can be lined up with test_df.
test_dataset = ECGImageDataset(test_df, data_dir, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4, pin_memory=False)

# Example of iterating through the dataloader: show the shapes of one batch.
for images, labels in train_loader:
    print(images.shape, labels.shape)
    break


torch.Size([32, 3, 224, 224]) torch.Size([32])


In [7]:
print(labels.int())

tensor([0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
        1, 1, 1, 0, 1, 0, 0, 0], dtype=torch.int32)


In [8]:
import os
import shutil

import torch
import torch.utils.data
# import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets

import argparse
import re
import pandas as pd

from helpers import makedir
import model
import push
import train_and_test as tnt
import save
from log import create_logger
from preprocess import mean, std, preprocess_input_function

In [16]:
import time
import torch

from helpers import list_of_distances, make_one_hot

import time
import torch
from sklearn.metrics import f1_score, roc_auc_score,accuracy_score

def _train_or_test(model, dataloader, optimizer=None, class_specific=True, use_l1_mask=True,
                   coefs=None, log=print, max_batches=None):
    """Run one pass over ``dataloader`` and log losses and classification metrics.

    The pass is a training pass when ``optimizer`` is given: gradients are
    enabled and the optimizer is stepped after every batch. Otherwise the
    forward passes run under ``torch.no_grad()``.

    Args:
        model: wrapped network. It is called as ``model(inputs)`` and must
            return ``(output, min_distances)``; the underlying network is
            reached through ``model.module``.
        dataloader: iterable of ``(image, label)`` batches. Labels are cast to
            ``long`` before use, so bool/int labels are accepted.
        optimizer: optimizer to step, or ``None`` for an evaluation pass.
        class_specific: if true, compute the cluster/separation costs from
            ``prototype_class_identity``; otherwise only a class-agnostic
            cluster cost.
        use_l1_mask: with ``class_specific``, restrict the L1 term to the
            last-layer weights selected by ``1 - prototype_class_identity^T``.
        coefs: optional dict with keys ``'crs_ent'``, ``'clst'``, ``'l1'`` and,
            for ``class_specific``, ``'sep'``. When ``None`` the fixed weights
            written in the loss expressions below are used.
        log: callable taking one string.
        max_batches: optional cap on the number of batches processed. ``None``
            (default) processes the whole loader; pass ``4`` for the quick
            debugging run this function used to hard-code.

    Returns:
        float: fraction of correctly classified examples.

    Raises:
        ValueError: the dataloader produced no batches.
    """
    import numpy as np  # local import: numpy is not among this cell's imports

    is_train = optimizer is not None
    start = time.time()
    n_examples = 0
    n_correct = 0
    n_batches = 0
    total_cross_entropy = 0
    total_cluster_cost = 0
    total_separation_cost = 0
    total_avg_separation_cost = 0

    all_labels = []
    all_scores = []

    net = model.module

    for i, (image, label) in enumerate(dataloader):
        # Stop outright instead of iterating (and loading) the remaining batches.
        if max_batches is not None and i >= max_batches:
            break

        # cross_entropy and the integer indexing below both need integer class
        # ids; a bool label tensor would be read as a mask.
        label = label.long()
        inputs = image.cuda()
        target = label.cuda()

        grad_req = torch.enable_grad() if is_train else torch.no_grad()
        with grad_req:
            output, min_distances = model(inputs)

            cross_entropy = torch.nn.functional.cross_entropy(output, target)

            if class_specific:
                max_dist = (net.prototype_shape[1]
                            * net.prototype_shape[2]
                            * net.prototype_shape[3])

                # One row per example; entries taken from the columns of
                # prototype_class_identity selected by each example's label.
                prototypes_of_correct_class = torch.t(net.prototype_class_identity[:, label]).cuda()
                inverted_distances, _ = torch.max((max_dist - min_distances) * prototypes_of_correct_class, dim=1)
                cluster_cost = torch.mean(max_dist - inverted_distances)

                prototypes_of_wrong_class = 1 - prototypes_of_correct_class
                inverted_distances_to_nontarget_prototypes, _ = \
                    torch.max((max_dist - min_distances) * prototypes_of_wrong_class, dim=1)
                separation_cost = torch.mean(max_dist - inverted_distances_to_nontarget_prototypes)

                avg_separation_cost = \
                    torch.sum(min_distances * prototypes_of_wrong_class, dim=1) / torch.sum(prototypes_of_wrong_class, dim=1)
                avg_separation_cost = torch.mean(avg_separation_cost)

                if use_l1_mask:
                    l1_mask = 1 - torch.t(net.prototype_class_identity).cuda()
                    l1 = (net.last_layer.weight * l1_mask).norm(p=1)
                else:
                    l1 = net.last_layer.weight.norm(p=1)

            else:
                min_distance, _ = torch.min(min_distances, dim=1)
                cluster_cost = torch.mean(min_distance)
                l1 = net.last_layer.weight.norm(p=1)

            _, predicted = torch.max(output.data, 1)
            n_examples += target.size(0)
            n_batches += 1
            n_correct += (predicted == target).sum().item()

            total_cross_entropy += cross_entropy.item()
            total_cluster_cost += cluster_cost.item()
            # The separation costs only exist in the class-specific branch.
            if class_specific:
                total_separation_cost += separation_cost.item()
                total_avg_separation_cost += avg_separation_cost.item()

            # Keep labels and softmax scores for the epoch-level metrics.
            all_labels.extend(target.cpu().numpy())
            all_scores.extend(output.softmax(dim=1).detach().cpu().numpy())

        if is_train:
            if class_specific:
                if coefs is not None:
                    loss = (coefs['crs_ent'] * cross_entropy
                          + coefs['clst'] * cluster_cost
                          + coefs['sep'] * separation_cost
                          + coefs['l1'] * l1)
                else:
                    loss = cross_entropy + 0.8 * cluster_cost - 0.08 * separation_cost + 1e-4 * l1
            else:
                if coefs is not None:
                    loss = (coefs['crs_ent'] * cross_entropy
                          + coefs['clst'] * cluster_cost
                          + coefs['l1'] * l1)
                else:
                    loss = cross_entropy + 0.8 * cluster_cost + 1e-4 * l1
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Drop the per-batch tensors before the next batch is moved to the GPU.
        del inputs
        del target
        del output
        del predicted
        del min_distances

    if n_batches == 0:
        raise ValueError('dataloader produced no batches; nothing to train or evaluate on')

    end = time.time()

    log('\ttime: \t{0}'.format(end - start))
    log('\tcross ent: \t{0}'.format(total_cross_entropy / n_batches))
    log('\tcluster: \t{0}'.format(total_cluster_cost / n_batches))
    if class_specific:
        log('\tseparation:\t{0}'.format(total_separation_cost / n_batches))
        log('\tavg separation:\t{0}'.format(total_avg_separation_cost / n_batches))
    log('\taccu: \t\t{0}%'.format(n_correct / n_examples * 100))
    log('\tl1: \t\t{0}'.format(net.last_layer.weight.norm(p=1).item()))
    with torch.no_grad():
        p = net.prototype_vectors.view(net.num_prototypes, -1).cpu()
        p_avg_pair_dist = torch.mean(list_of_distances(p, p))
    log('\tp dist pair: \t{0}'.format(p_avg_pair_dist.item()))

    # Calculate metrics on arrays: a Python list cannot be sliced with [:, k].
    y_true = np.asarray(all_labels)
    y_score = np.asarray(all_scores)
    y_pred = y_score.argmax(axis=1)

    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')
    try:
        if y_score.shape[1] == 2:
            # Binary case: roc_auc_score expects the score of the positive class (1).
            auroc = roc_auc_score(y_true, y_score[:, 1])
        else:
            auroc = roc_auc_score(y_true, y_score, multi_class='ovr', average='weighted')
    except ValueError as err:
        # Raised e.g. when the evaluated labels do not contain every class.
        log('\tAUROC could not be computed: {0}'.format(err))
        auroc = float('nan')

    log(f'\tAccuracy: {accuracy * 100:.2f}%')
    log(f'\tF1 Score: {f1:.4f}')
    log(f'\tAUROC: {auroc:.4f}')

    return n_correct / n_examples


def train(model, dataloader, optimizer, class_specific=False, coefs=None, log=print):
    """Run one training pass over ``dataloader``.

    Logs a ``train`` marker, puts ``model`` into training mode and delegates
    to ``_train_or_test`` with the given optimizer.

    Returns:
        Whatever ``_train_or_test`` returns for this pass.
    """
    assert optimizer is not None

    log('\ttrain')
    model.train()
    result = _train_or_test(
        model=model,
        dataloader=dataloader,
        optimizer=optimizer,
        class_specific=class_specific,
        coefs=coefs,
        log=log,
    )
    return result


def test(model, dataloader, class_specific=False, log=print):
    """Run one evaluation pass over ``dataloader``.

    Logs a ``test`` marker, puts ``model`` into eval mode and delegates to
    ``_train_or_test`` without an optimizer.

    Returns:
        Whatever ``_train_or_test`` returns for this pass.
    """
    log('\ttest')
    model.eval()
    result = _train_or_test(
        model=model,
        dataloader=dataloader,
        optimizer=None,
        class_specific=class_specific,
        log=log,
    )
    return result


def last_only(model, log=print):
    """Make only the last layer of ``model.module`` trainable.

    ``features``, ``add_on_layers`` and ``prototype_vectors`` are frozen;
    every parameter of ``last_layer`` has ``requires_grad`` switched on.
    """
    net = model.module

    for frozen_part in (net.features, net.add_on_layers):
        for param in frozen_part.parameters():
            param.requires_grad = False
    net.prototype_vectors.requires_grad = False

    for param in net.last_layer.parameters():
        param.requires_grad = True

    log('\tlast layer')


def warm_only(model, log=print):
    """Freeze ``features`` and make the rest of ``model.module`` trainable.

    ``add_on_layers``, ``prototype_vectors`` and ``last_layer`` have
    ``requires_grad`` switched on; ``features`` has it switched off.
    """
    net = model.module

    for param in net.features.parameters():
        param.requires_grad = False

    for trainable_part in (net.add_on_layers, net.last_layer):
        for param in trainable_part.parameters():
            param.requires_grad = True
    net.prototype_vectors.requires_grad = True

    log('\twarm')


def joint(model, log=print):
    """Make every part of ``model.module`` trainable.

    ``features``, ``add_on_layers``, ``prototype_vectors`` and ``last_layer``
    all have ``requires_grad`` switched on.
    """
    net = model.module

    for part in (net.features, net.add_on_layers, net.last_layer):
        for param in part.parameters():
            param.requires_grad = True
    net.prototype_vectors.requires_grad = True

    log('\tjoint')


In [None]:
# book keeping namings and code
from settings import base_architecture, img_size, prototype_shape, num_classes, \
                     prototype_activation_function, add_on_layers_type, experiment_run

# Leading run of lowercase letters of the architecture name; used below to
# build the name of the '<type>_features.py' file that is archived.
base_architecture_type = re.match('^[a-z]*', base_architecture).group(0)

# Output directory of this run: ./saved_models/<base_architecture>/<experiment_run>/
model_dir = './saved_models/' + base_architecture + '/' + experiment_run + '/'
makedir(model_dir)
# Archive the source files that define this run next to its outputs.
# (The __file__ copy is disabled — presumably because __file__ is not defined in a notebook; confirm.)
# shutil.copy(src=os.path.join(os.getcwd(), __file__), dst=model_dir)
shutil.copy(src=os.path.join(os.getcwd(), 'settings.py'), dst=model_dir)
shutil.copy(src=os.path.join(os.getcwd(), base_architecture_type + '_features.py'), dst=model_dir)
shutil.copy(src=os.path.join(os.getcwd(), 'model.py'), dst=model_dir)
shutil.copy(src=os.path.join(os.getcwd(), 'train_and_test.py'), dst=model_dir)

# `log` writes to <model_dir>/train.log; `logclose` is called at the end of the cell.
log, logclose = create_logger(log_filename=os.path.join(model_dir, 'train.log'))
# Directory handed to push_prototypes for saving prototype images.
img_dir = os.path.join(model_dir, 'img')
makedir(img_dir)
# File-name stems for the artefacts written during prototype pushing.
weight_matrix_filename = 'outputL_weights'
prototype_img_filename_prefix = 'prototype-img'
prototype_self_act_filename_prefix = 'prototype-self-act'
proto_bound_boxes_filename_prefix = 'bb'

# load the data
from settings import train_dir, test_dir, train_push_dir, \
                     train_batch_size, test_batch_size, train_push_batch_size
# ---------------------------------------------------------------
# Updated data loader code
from settings import data_dir, csv_file_for_labels
from dataset_class import ECGImageDataset

# Define transformations
# NOTE(review): this rebinds img_size, replacing the value imported from
# settings at the top of the cell — confirm the two are meant to agree.
img_size = 224  # or whatever size you want
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# Pipeline applied to each image: resize -> tensor -> normalise.
transform = transforms.Compose([
    transforms.Resize((img_size, img_size)),
    transforms.ToTensor(),
    normalize,
])

# Load the CSV file
label_df = pd.read_csv(csv_file_for_labels)

# The whole label table is used for training here; no rows are held out in this cell.
train_dataset = ECGImageDataset(label_df, data_dir, transform=transform)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True, num_workers=4, pin_memory=False)

# we should look into distributed sampler more carefully at torch.utils.data.distributed.DistributedSampler(train_dataset)
log('training set size: {0}'.format(len(train_loader.dataset)))
# Push and test loaders are not built in this cell, so their sizes are not logged.
# log('push set size: {0}'.format(len(train_push_loader.dataset)))
# log('test set size: {0}'.format(len(test_loader.dataset)))
log('batch size: {0}'.format(train_batch_size))

# construct the model
# NOTE(review): base_architecture is rebound here, after model_dir and the
# archived '<type>_features.py' were derived from the value imported from
# settings; if the two differ, the run directory name will not match the
# architecture actually trained — confirm.
base_architecture = 'resnet18'
ppnet = model.construct_PPNet(base_architecture=base_architecture,
                              pretrained=True, img_size=img_size,
                              prototype_shape=prototype_shape,
                              num_classes=num_classes,
                              prototype_activation_function=prototype_activation_function,
                              add_on_layers_type=add_on_layers_type)
#if prototype_activation_function == 'linear':
#    ppnet.set_last_layer_incorrect_connection(incorrect_strength=0)
# Move the network to the GPU and wrap it; the train/test helpers are given
# the wrapped model, while saving and the optimizers use `ppnet` directly.
ppnet = ppnet.to('cuda')
ppnet_multi = torch.nn.DataParallel(ppnet)
# Passed as `class_specific` to every train/test/push call below.
class_specific = True

# define optimizer
from settings import joint_optimizer_lrs, joint_lr_step_size
# Joint stage: features, add-on layers and prototype vectors, each with its
# own learning rate. The last layer is not part of this optimizer.
joint_optimizer_specs = \
[{'params': ppnet.features.parameters(), 'lr': joint_optimizer_lrs['features'], 'weight_decay': 1e-3}, # bias are now also being regularized
 {'params': ppnet.add_on_layers.parameters(), 'lr': joint_optimizer_lrs['add_on_layers'], 'weight_decay': 1e-3},
 {'params': ppnet.prototype_vectors, 'lr': joint_optimizer_lrs['prototype_vectors']},
]
joint_optimizer = torch.optim.Adam(joint_optimizer_specs)
# Multiplies the joint learning rates by 0.1 every joint_lr_step_size scheduler steps.
joint_lr_scheduler = torch.optim.lr_scheduler.StepLR(joint_optimizer, step_size=joint_lr_step_size, gamma=0.1)

from settings import warm_optimizer_lrs
# Warm-up stage: only add-on layers and prototype vectors are optimised.
warm_optimizer_specs = \
[{'params': ppnet.add_on_layers.parameters(), 'lr': warm_optimizer_lrs['add_on_layers'], 'weight_decay': 1e-3},
 {'params': ppnet.prototype_vectors, 'lr': warm_optimizer_lrs['prototype_vectors']},
]
warm_optimizer = torch.optim.Adam(warm_optimizer_specs)

from settings import last_layer_optimizer_lr
# Last-layer stage: only the last layer's parameters are optimised.
last_layer_optimizer_specs = [{'params': ppnet.last_layer.parameters(), 'lr': last_layer_optimizer_lr}]
last_layer_optimizer = torch.optim.Adam(last_layer_optimizer_specs)

# weighting of different training losses
from settings import coefs

# number of training epochs, number of warm epochs, push start epoch, push epochs
from settings import num_train_epochs, num_warm_epochs, push_start, push_epochs

# train the model
log('start training')
import copy
try:
    for epoch in range(num_train_epochs):
        log('epoch: \t{0}'.format(epoch))

        if epoch < num_warm_epochs:
            # Warm-up epochs: backbone frozen, warm optimizer.
            tnt.warm_only(model=ppnet_multi, log=log)
            _ = tnt.train(model=ppnet_multi, dataloader=train_loader, optimizer=warm_optimizer,
                          class_specific=class_specific, coefs=coefs, log=log)
        else:
            # Joint epochs: everything trainable, joint optimizer.
            tnt.joint(model=ppnet_multi, log=log)
            _ = tnt.train(model=ppnet_multi, dataloader=train_loader, optimizer=joint_optimizer,
                          class_specific=class_specific, coefs=coefs, log=log)
            # Step the schedule only after the optimizer has stepped for this
            # epoch; stepping first skips the first value of the LR schedule
            # on PyTorch >= 1.1.
            joint_lr_scheduler.step()

        # NOTE: evaluation runs on train_loader (no separate test loader is
        # built in this cell), so `accu` is a training-set accuracy.
        accu = tnt.test(model=ppnet_multi, dataloader=train_loader,  # CHANGE TEST_LOADER TO TRAIN_LOADER
                        class_specific=class_specific, log=log)
        save.save_model_w_condition(model=ppnet, model_dir=model_dir, model_name=str(epoch) + 'nopush', accu=accu,
                                    target_accu=0.70, log=log)

        if epoch >= push_start and epoch in push_epochs:
            push.push_prototypes(
                train_loader, # pytorch dataloader (must be unnormalized in [0,1])   # CHANGE TRAIN_PUSH_LOADER TO TRAIN_LOADER
                prototype_network_parallel=ppnet_multi, # pytorch network with prototype_vectors
                class_specific=class_specific,
                preprocess_input_function=preprocess_input_function, # normalize if needed
                prototype_layer_stride=1,
                root_dir_for_saving_prototypes=img_dir, # if not None, prototypes will be saved here
                epoch_number=epoch, # if not provided, prototypes saved previously will be overwritten
                prototype_img_filename_prefix=prototype_img_filename_prefix,
                prototype_self_act_filename_prefix=prototype_self_act_filename_prefix,
                proto_bound_boxes_filename_prefix=proto_bound_boxes_filename_prefix,
                save_prototype_class_identity=True,
                log=log)
            accu = tnt.test(model=ppnet_multi, dataloader=train_loader,  # CHANGE TEST_LOADER TO TRAIN_LOADER
                            class_specific=class_specific, log=log)
            save.save_model_w_condition(model=ppnet, model_dir=model_dir, model_name=str(epoch) + 'push', accu=accu,
                                        target_accu=0.70, log=log)

            if prototype_activation_function != 'linear':
                # After a push, train only the last layer for 20 short iterations.
                tnt.last_only(model=ppnet_multi, log=log)
                for i in range(20):
                    log('iteration: \t{0}'.format(i))
                    _ = tnt.train(model=ppnet_multi, dataloader=train_loader, optimizer=last_layer_optimizer,
                                  class_specific=class_specific, coefs=coefs, log=log)
                    accu = tnt.test(model=ppnet_multi, dataloader=train_loader, # CHANGE TEST_LOADER TO TRAIN_LOADER
                                    class_specific=class_specific, log=log)
                    save.save_model_w_condition(model=ppnet, model_dir=model_dir, model_name=str(epoch) + '_' + str(i) + 'push', accu=accu,
                                                target_accu=0.70, log=log)
finally:
    # Close the log file even if training is interrupted or raises.
    logclose()

In [1]:
def log_to_file_and_console(message, logfile='results.txt'):
    """Print ``message`` and append it as one line to ``logfile``.

    Args:
        message: value to log; it is converted with ``str()`` so non-string
            values (numbers, exceptions, ...) are accepted.
        logfile: path of the file to append to. It is created if missing and
            written as UTF-8.
    """
    text = str(message)
    print(text)
    with open(logfile, 'a', encoding='utf-8') as f:
        f.write(text + '\n')

log_to_file_and_console("Test message")


Test message


In [None]:
### Creating the DataFrame for 5-class classification

In [3]:
import pandas as pd
import ast
import numpy as np

scp_statements_path = '../../../data/padmalab_external/special_project/physionet.org/files/ptb-xl/1.0.3/scp_statements.csv'
database_path = '../../../data/padmalab_external/special_project/physionet.org/files/ptb-xl/1.0.3/ptbxl_database.csv'

In [4]:
# scp statements file
df = pd.read_csv(scp_statements_path)

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,description,diagnostic,form,rhythm,diagnostic_class,diagnostic_subclass,Statement Category,SCP-ECG Statement Description,AHA code,aECG REFID,CDISC Code,DICOM Code
0,NDT,non-diagnostic T abnormalities,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,non-diagnostic T abnormalities,,,,
1,NST_,non-specific ST changes,1.0,1.0,,STTC,NST_,Basic roots for coding ST-T changes and abnorm...,non-specific ST changes,145.0,MDC_ECG_RHY_STHILOST,,
2,DIG,digitalis-effect,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,suggests digitalis-effect,205.0,,,
3,LNGQT,long QT-interval,1.0,1.0,,STTC,STTC,other ST-T descriptive statements,long QT-interval,148.0,,,
4,NORM,normal ECG,1.0,,,NORM,NORM,Normal/abnormal,normal ECG,1.0,,,F-000B7


In [6]:
df['diagnostic_class'].unique()

array(['STTC', 'NORM', 'MI', 'HYP', 'CD', nan], dtype=object)

In [7]:
# database
df2 = pd.read_csv(database_path)
df2.head()

Unnamed: 0,ecg_id,patient_id,age,sex,height,weight,nurse,site,device,recording_date,...,validated_by_human,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr
0,1,15709.0,56.0,1,,63.0,2.0,0.0,CS-12 E,1984-11-09 09:17:34,...,True,,", I-V1,",,,,,3,records100/00000/00001_lr,records500/00000/00001_hr
1,2,13243.0,19.0,0,,70.0,2.0,0.0,CS-12 E,1984-11-14 12:55:37,...,True,,,,,,,2,records100/00000/00002_lr,records500/00000/00002_hr
2,3,20372.0,37.0,1,,69.0,2.0,0.0,CS-12 E,1984-11-15 12:49:10,...,True,,,,,,,5,records100/00000/00003_lr,records500/00000/00003_hr
3,4,17014.0,24.0,0,,82.0,2.0,0.0,CS-12 E,1984-11-15 13:44:57,...,True,", II,III,AVF",,,,,,3,records100/00000/00004_lr,records500/00000/00004_hr
4,5,17448.0,19.0,1,,70.0,2.0,0.0,CS-12 E,1984-11-17 10:43:15,...,True,", III,AVR,AVF",,,,,,4,records100/00000/00005_lr,records500/00000/00005_hr


In [8]:
print(len(df2))
print(len(df2.loc[df2['validated_by_human'] == True]))

21799
16056


In [9]:
# load and convert annotation data
Y = pd.read_csv(database_path, index_col='ecg_id')
Y.scp_codes = Y.scp_codes.apply(lambda x: ast.literal_eval(x))

In [10]:
Y.scp_codes

ecg_id
1                 {'NORM': 100.0, 'LVOLT': 0.0, 'SR': 0.0}
2                             {'NORM': 80.0, 'SBRAD': 0.0}
3                               {'NORM': 100.0, 'SR': 0.0}
4                               {'NORM': 100.0, 'SR': 0.0}
5                               {'NORM': 100.0, 'SR': 0.0}
                               ...                        
21833    {'NDT': 100.0, 'PVC': 100.0, 'VCLVH': 0.0, 'ST...
21834             {'NORM': 100.0, 'ABQRS': 0.0, 'SR': 0.0}
21835                           {'ISCAS': 50.0, 'SR': 0.0}
21836                           {'NORM': 100.0, 'SR': 0.0}
21837                           {'NORM': 100.0, 'SR': 0.0}
Name: scp_codes, Length: 21799, dtype: object

In [11]:
# Load scp_statements.csv for diagnostic aggregation
agg_df = pd.read_csv(scp_statements_path, index_col=0)
agg_df = agg_df[agg_df.diagnostic == 1]

def aggregate_diagnostic(y_dic):
    """Map one record's SCP codes to its diagnostic superclasses.

    Args:
        y_dic: dict whose keys are SCP statement codes (the values are not used).

    Returns:
        List of the distinct ``diagnostic_class`` values that the module-level
        ``agg_df`` assigns to those codes, in sorted order so the result does
        not depend on set iteration order. Codes missing from ``agg_df.index``
        are ignored.
    """
    classes = {agg_df.loc[key].diagnostic_class for key in y_dic if key in agg_df.index}
    # key=str keeps the sort well-defined even if a non-string value slips in.
    return sorted(classes, key=str)

# Apply diagnostic superclass
Y['diagnostic_superclass'] = Y.scp_codes.apply(aggregate_diagnostic)

In [12]:
Y.head(20)

Unnamed: 0_level_0,patient_id,age,sex,height,weight,nurse,site,device,recording_date,report,...,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr,diagnostic_superclass
ecg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,15709.0,56.0,1,,63.0,2.0,0.0,CS-12 E,1984-11-09 09:17:34,sinusrhythmus periphere niederspannung,...,,", I-V1,",,,,,3,records100/00000/00001_lr,records500/00000/00001_hr,[NORM]
2,13243.0,19.0,0,,70.0,2.0,0.0,CS-12 E,1984-11-14 12:55:37,sinusbradykardie sonst normales ekg,...,,,,,,,2,records100/00000/00002_lr,records500/00000/00002_hr,[NORM]
3,20372.0,37.0,1,,69.0,2.0,0.0,CS-12 E,1984-11-15 12:49:10,sinusrhythmus normales ekg,...,,,,,,,5,records100/00000/00003_lr,records500/00000/00003_hr,[NORM]
4,17014.0,24.0,0,,82.0,2.0,0.0,CS-12 E,1984-11-15 13:44:57,sinusrhythmus normales ekg,...,", II,III,AVF",,,,,,3,records100/00000/00004_lr,records500/00000/00004_hr,[NORM]
5,17448.0,19.0,1,,70.0,2.0,0.0,CS-12 E,1984-11-17 10:43:15,sinusrhythmus normales ekg,...,", III,AVR,AVF",,,,,,4,records100/00000/00005_lr,records500/00000/00005_hr,[NORM]
6,19005.0,18.0,1,,58.0,2.0,0.0,CS-12 E,1984-11-28 13:32:13,sinusrhythmus normales ekg,...,", V1",,,,,,4,records100/00000/00006_lr,records500/00000/00006_hr,[NORM]
7,16193.0,54.0,0,,83.0,2.0,0.0,CS-12 E,1984-11-28 13:32:22,"sinusrhythmus linkstyp t abnormal, wahrscheinl...",...,,,,,,,7,records100/00000/00007_lr,records500/00000/00007_hr,[NORM]
8,11275.0,48.0,0,,95.0,2.0,0.0,CS-12 E,1984-12-01 14:49:52,sinusrhythmus linkstyp qrs(t) abnormal infe...,...,", II,AVF",", I-AVF,",,,,,9,records100/00000/00008_lr,records500/00000/00008_hr,[MI]
9,18792.0,55.0,0,,70.0,2.0,0.0,CS-12 E,1984-12-08 09:44:43,sinusrhythmus normales ekg,...,,", I-AVR,",,,,,10,records100/00000/00009_lr,records500/00000/00009_hr,[NORM]
10,9456.0,22.0,1,,56.0,2.0,0.0,CS-12 E,1984-12-12 14:12:46,sinusrhythmus normales ekg,...,,,,,,,9,records100/00000/00010_lr,records500/00000/00010_hr,[NORM]


In [13]:
# Split data into train and test
test_fold = 10

# Train
y_train = Y[(Y.strat_fold != test_fold)].diagnostic_superclass
# Test
y_test = Y[Y.strat_fold == test_fold].diagnostic_superclass

In [14]:
# Filter to get only elements with one class
y_train_single_class = y_train[y_train.apply(lambda x: len(x) == 1)]

# Get unique classes in the filtered elements
unique_classes = np.unique(y_train_single_class)

print(unique_classes)
len(y_train_single_class)

[list(['CD']) list(['HYP']) list(['MI']) list(['NORM']) list(['STTC'])]


14594

In [15]:
y_train_single_class[:30]

ecg_id
1     [NORM]
2     [NORM]
3     [NORM]
4     [NORM]
5     [NORM]
6     [NORM]
7     [NORM]
8       [MI]
10    [NORM]
11    [NORM]
12    [NORM]
13    [NORM]
14    [NORM]
15    [NORM]
16    [NORM]
19    [NORM]
21    [NORM]
22    [STTC]
24    [NORM]
25    [NORM]
26    [STTC]
27    [NORM]
28    [STTC]
29    [NORM]
30     [HYP]
31    [NORM]
32      [CD]
33    [NORM]
35    [NORM]
36    [NORM]
Name: diagnostic_superclass, dtype: object

In [16]:
from sklearn.preprocessing import LabelEncoder
# Flatten the list structure
y_train_flat = y_train_single_class.apply(lambda x: x[0])

# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit the label encoder and transform the labels to integer encoded labels
y_train_encoded = label_encoder.fit_transform(y_train_flat)

print("Integer Encoded Labels: ", y_train_encoded[:30])

Integer Encoded Labels:  [3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 4 3 3 4 3 4 3 1 3 0 3 3 3]


In [17]:
# Print the mapping of integers to original labels
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping: ", label_mapping)

Label Mapping:  {'CD': 0, 'HYP': 1, 'MI': 2, 'NORM': 3, 'STTC': 4}


In [18]:
import os
import pandas as pd
import ast
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from sklearn.preprocessing import LabelEncoder
import torchvision.transforms as transforms

scp_statements_path = '../../../data/padmalab_external/special_project/physionet.org/files/ptb-xl/1.0.3/scp_statements.csv'
database_path = '../../../data/padmalab_external/special_project/physionet.org/files/ptb-xl/1.0.3/ptbxl_database.csv'
img_dir = "../../../data/padmalab_external/special_project/physionet.org/files/ptb-xl/1.0.3/records100_ground_truth"

# Define transformations
img_size = 224  # or whatever size you want
transform = transforms.Compose([
    transforms.Resize((img_size, img_size)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

class ECGImageDataset(Dataset):
    """ECG image dataset labelled with a single diagnostic superclass.

    Labels are derived from the database CSV (``scp_codes``) and the SCP
    statements CSV (``diagnostic_class``). Only records that map to exactly
    one superclass are kept. Records with ``strat_fold == 10`` form the test
    split; all other records form the training split.

    Args:
        scp_statements_df_path: path to the SCP statements CSV.
        ptb_xl_database_df_path: path to the database CSV (indexed by ``ecg_id``).
        image_dir: directory that is walked recursively for ``.png`` files.
        transform: optional callable applied to the loaded RGB image.
        test: select the test split when true, the training split otherwise.
    """

    def __init__(self, scp_statements_df_path, ptb_xl_database_df_path, image_dir, transform=None, test=False):
        self.scp_statements_df_path = scp_statements_df_path
        self.ptb_xl_database_df_path = ptb_xl_database_df_path
        self.image_dir = image_dir
        self.transform = transform
        self.ecg_labels, self.ecg_paths = self.get_ecg_paths_and_labels(test)
        self.image_paths = self._get_image_paths()
        self._index_image_paths()

    def get_ecg_paths_and_labels(self, test):
        """Return ``(encoded_labels, file_stems)`` for the requested split.

        Also stores the fitted encoder as ``self.label_encoder`` so integer
        labels can be mapped back to class names.
        """
        # Load the database file
        ptb_xl_database_df = pd.read_csv(self.ptb_xl_database_df_path, index_col='ecg_id')
        ptb_xl_database_df.scp_codes = ptb_xl_database_df.scp_codes.apply(ast.literal_eval)

        # Load scp_statements.csv for diagnostic aggregation
        agg_df = pd.read_csv(self.scp_statements_df_path, index_col=0)
        agg_df = agg_df[agg_df.diagnostic == 1]

        def aggregate_diagnostic(y_dic):
            # Distinct superclasses of the record's codes, in a stable order.
            classes = {agg_df.loc[key].diagnostic_class for key in y_dic if key in agg_df.index}
            return sorted(classes, key=str)

        # Apply diagnostic superclass
        ptb_xl_database_df['diagnostic_superclass'] = ptb_xl_database_df.scp_codes.apply(aggregate_diagnostic)

        # Keep only records with exactly one superclass
        is_single = ptb_xl_database_df.diagnostic_superclass.apply(len) == 1
        single_class_df = ptb_xl_database_df[is_single]

        # Fit the encoder on every single-class record, not on one split, so
        # the train and test datasets always share the same class -> integer map.
        label_encoder = LabelEncoder()
        label_encoder.fit(single_class_df.diagnostic_superclass.apply(lambda x: x[0]))
        self.label_encoder = label_encoder

        # Split data into train and test
        test_fold = 10
        in_test_fold = single_class_df.strat_fold == test_fold
        y_single_class = single_class_df[in_test_fold] if test else single_class_df[~in_test_fold]

        # Flatten the one-element lists and encode them as integers
        y_single_class_flat = y_single_class.diagnostic_superclass.apply(lambda x: x[0])
        y_encoded = label_encoder.transform(y_single_class_flat)

        # Last path component of filename_lr; used as the image file stem
        y_file_names = y_single_class.filename_lr.apply(lambda x: x.split('/')[-1])

        return y_encoded, y_file_names

    def _get_image_paths(self):
        """Return the paths of all ``.png`` files found under ``image_dir``."""
        image_paths = []
        for root, _, files in os.walk(self.image_dir):
            for file in files:
                if file.endswith('.png'):
                    image_paths.append(os.path.join(root, file))
        return image_paths

    def _index_image_paths(self):
        """Build the file-name lookup used by ``_find_image`` from ``image_paths``."""
        # File name -> first path carrying that name.
        self._path_by_name = {}
        for path in self.image_paths:
            self._path_by_name.setdefault(os.path.basename(path), path)
        # Memo of already-resolved image names (misses are stored as None).
        self._resolved = {}

    def _find_image(self, img_name):
        """Return the path of the image called ``img_name``, or ``None``.

        An exact file-name match is preferred; otherwise the first path that
        contains ``img_name`` as a substring is used. Results are cached so a
        full scan happens at most once per name.
        """
        if img_name not in self._resolved:
            path = self._path_by_name.get(img_name)
            if path is None:
                path = next((p for p in self.image_paths if img_name in p), None)
            self._resolved[img_name] = path
        return self._resolved[img_name]

    def __len__(self):
        """Number of labelled records in the selected split."""
        return len(self.ecg_labels)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for record ``idx``; the label is a long tensor.

        Raises:
            FileNotFoundError: the record's image is not present under ``image_dir``.
        """
        img_name = self.ecg_paths.iloc[idx] + '-0.png'
        img_path = self._find_image(img_name)

        if img_path is None:
            raise FileNotFoundError(f"Image {img_name} not found in the dataset.")

        # Close the file handle as soon as the pixels have been converted.
        with Image.open(img_path) as img:
            image = img.convert('RGB')
        label = self.ecg_labels[idx]

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(label).long()

# Initialize dataset and dataloader for testing
# test=True selects the records with strat_fold == 10 inside ECGImageDataset.
test_dataset = ECGImageDataset(scp_statements_path, database_path, img_dir, transform=transform, test=True)
test_dataloader = DataLoader(test_dataset, batch_size=32)
print(len(test_dataset))

# Initialize dataset and dataloader for training
# NOTE(review): no `shuffle` argument is passed, so batches follow dataset
# order — confirm that is intended before using this loader for training.
train_dataset = ECGImageDataset(scp_statements_path, database_path, img_dir, transform=transform, test=False)
train_dataloader = DataLoader(train_dataset, batch_size=32)
print(len(train_dataset))

# Print the shapes of the examples and labels from the train dataloader
# (only the first batch is pulled).
for examples, labels in train_dataloader:
    print(examples.shape, labels.shape)
    break


1650
14594
torch.Size([32, 3, 224, 224]) torch.Size([32])


In [19]:
examples[0].shape

torch.Size([3, 224, 224])

In [40]:
### CODE TO CREATE LABELS FILES

In [41]:
import os
img_dir = "../../../data/padmalab_external/special_project/physionet.org/files/ptb-xl/1.0.3/records100_ground_truth"

# Recursively gather the full path of every PNG below img_dir, in os.walk order.
image_paths = [
    os.path.join(folder, filename)
    for folder, _subdirs, filenames in os.walk(img_dir)
    for filename in filenames
    if filename.endswith('.png')
]
# image_paths

In [52]:
import os
import pandas as pd
import ast
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from sklearn.preprocessing import LabelEncoder
import torchvision.transforms as transforms

# Split selector: False builds the training split, True the test split.
test = False

# Load the database file; scp_codes is stored as text and parsed into Python literals.
ptb_xl_database_df = pd.read_csv(database_path, index_col='ecg_id')
ptb_xl_database_df['scp_codes'] = ptb_xl_database_df['scp_codes'].apply(ast.literal_eval)

# Load scp_statements.csv and keep only the rows flagged as diagnostic
agg_df = pd.read_csv(scp_statements_path, index_col=0)
agg_df = agg_df.loc[agg_df['diagnostic'] == 1]

def aggregate_diagnostic(y_dic):
    """Map one record's SCP codes to its unique diagnostic superclasses.

    Args:
        y_dic: dict keyed by SCP statement code (the values are not used).

    Returns:
        list: De-duplicated ``diagnostic_class`` values looked up in ``agg_df``
        for the codes present in its index; unknown codes are ignored.
    """
    known_codes = agg_df.index
    superclasses = {
        agg_df.loc[code].diagnostic_class
        for code in y_dic.keys()
        if code in known_codes
    }
    return list(superclasses)

# Attach the aggregated diagnostic superclasses to every record
ptb_xl_database_df['diagnostic_superclass'] = ptb_xl_database_df.scp_codes.apply(aggregate_diagnostic)
Y = ptb_xl_database_df

# Fold 10 is used as the test split; every other fold goes to training
test_fold = 10
y_train = Y[Y.strat_fold != test_fold]
y_test = Y[Y.strat_fold == test_fold]

# Pick the split requested by the `test` flag and reduce each path to its file name
y = y_test if test else y_train
y_file_names = y.filename_lr.apply(lambda full_path: full_path.split('/')[-1])

# Keep only the records that carry exactly one superclass
has_single_class = y.diagnostic_superclass.apply(lambda classes: len(classes) == 1)
y_single_class = y[has_single_class]
y_file_names = y_file_names[has_single_class]

# Unwrap the one-element lists into plain class names
y_single_class_flat = y_single_class.diagnostic_superclass.apply(lambda classes: classes[0])

# Encode the class names as integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_single_class_flat)

# Print the mapping of original labels to integers
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping: ", label_mapping)

y_labels = y_encoded
# File names aligned with y_labels, re-indexed from 0 under the index name 'index'
y_paths = y_file_names.loc[y_single_class.index].reset_index(drop=True)
y_paths.index.name = 'index'

Label Mapping:  {'CD': 0, 'HYP': 1, 'MI': 2, 'NORM': 3, 'STTC': 4}


In [53]:
# Sanity check: number of file names in the selected split (should equal len(y_labels))
len(y_paths)

14594

In [54]:
# Sanity check: number of encoded labels in the selected split (should equal len(y_paths))
len(y_labels)

14594

In [55]:
# Pair every image whose path contains a record's file name with that record's label.
# Rows are ordered by record first and by position in image_paths second; a record
# contributes one row per matching path and none when no path matches.
data = [
    [img_path, y_labels[i]]
    for i, y_path in enumerate(y_paths)
    for img_path in image_paths
    if y_path in img_path
]

# Convert the list to a pandas DataFrame
df = pd.DataFrame(data, columns=['Image Path', 'Label'])

# Print the first few rows of the DataFrame to verify
print(df.head())
print(f"Total matches: {len(df)}")

                                          Image Path  Label
0  ../../../data/padmalab_external/special_projec...      3
1  ../../../data/padmalab_external/special_projec...      3
2  ../../../data/padmalab_external/special_projec...      3
3  ../../../data/padmalab_external/special_projec...      3
4  ../../../data/padmalab_external/special_projec...      3
Total matches: 12158


In [None]:
# Run the next cell only when the cells above were executed with `test = True` (it saves the test-split CSV)

In [51]:
# Save the matched image paths and labels as the test-split CSV (without the row index).
# NOTE(review): df is written as-is; unlike the train export below, rows whose path
# contains 'checkpoint' are not removed first — confirm the test frame has none.
df.to_csv('test-100HZ-files-and-labels.csv', index=False)

In [56]:
# Run the following 4 cells only for train

In [58]:
# Report the position of every row whose image path contains 'checkpoint'
# (presumably notebook checkpoint copies of real images — they must not be used as samples).
# Iterating over the column itself instead of a hard-coded row count keeps the
# check correct for any frame length (e.g. when df was built for the test split).
for i, image_path in enumerate(df["Image Path"]):
    if 'checkpoint' in image_path:
        print("contains")
        print(i)

contains
1
contains
11506


In [59]:
# Remove every row whose image path contains 'checkpoint'. Filtering by content
# instead of by hard-coded row labels keeps this correct when the number or order
# of matches changes between runs; the remaining rows keep their original index labels.
updated_df = df[~df["Image Path"].str.contains('checkpoint', regex=False)]

In [60]:
# Number of rows left after removing the 'checkpoint' entries
len(updated_df)

12156

In [62]:
# Save the cleaned image paths and labels as the train-split CSV (without the row index)
updated_df.to_csv('train-100HZ-files-and-labels.csv', index=False)

In [64]:
# Read the train CSV back to check that the export can be loaded.
# NOTE(review): despite its name, `load_test` holds the *train* file — presumably "test of the load".
load_test = pd.read_csv('train-100HZ-files-and-labels.csv')
load_test.head()

Unnamed: 0,Image Path,Label
0,../../../data/padmalab_external/special_projec...,3
1,../../../data/padmalab_external/special_projec...,3
2,../../../data/padmalab_external/special_projec...,3
3,../../../data/padmalab_external/special_projec...,3
4,../../../data/padmalab_external/special_projec...,3


In [None]:
# Scratch check of the image lookup: try one record position and, if it has no
# image, scan the records from the start until one with an image is found.
idx = 14592
img_name = y_paths.iloc[idx] + '-0.png'
print(img_name)
# All collected paths whose text contains the expected file name
matching_paths = [path for path in image_paths if img_name in path]
if not matching_paths:
    # No image for the requested record: restart the search at the first record
    idx = 0
while not matching_paths:
    print(f'Here {idx}\n')
    img_name = y_paths.iloc[idx] + '-0.png'
    matching_paths = [path for path in image_paths if img_name in path]
    # NOTE: idx is advanced even on the iteration that finds a match, so after the
    # loop it points one past the record whose image was found.
    idx += 1
    print(img_name)
        
matching_paths[0]

In [None]:
import os
import pandas as pd
import ast
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from sklearn.preprocessing import LabelEncoder
import torchvision.transforms as transforms

# Locations of the PTB-XL metadata files and of the directory holding the ECG images
# (relative paths, resolved against the notebook's working directory).
scp_statements_path = '../../../data/padmalab_external/special_project/physionet.org/files/ptb-xl/1.0.3/scp_statements.csv'
database_path = '../../../data/padmalab_external/special_project/physionet.org/files/ptb-xl/1.0.3/ptbxl_database.csv'
img_dir = "../../../data/padmalab_external/special_project/physionet.org/files/ptb-xl/1.0.3/records100_ground_truth"

# Define transformations
img_size = 224  # target side length; images are resized to img_size x img_size
transform = transforms.Compose([
    transforms.Resize((img_size, img_size)),  # fixed square size regardless of the source aspect ratio
    transforms.ToTensor(),
    # NOTE(review): these per-channel mean/std values look like the commonly used
    # ImageNet statistics — confirm they match the model these images are fed to.
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

class ECGImageDataset(Dataset):
    """Image dataset of PTB-XL ECG records labelled with one diagnostic superclass.

    Records are read from the PTB-XL database CSV, their SCP codes are aggregated
    into diagnostic superclasses via the SCP-statements CSV, and only records with
    exactly one superclass are kept. ``strat_fold == 10`` is the test split; all
    other folds form the training split.

    Args:
        scp_statements_df_path: Path to the SCP statements CSV.
        ptb_xl_database_df_path: Path to the database CSV (read with ``ecg_id`` as
            index; the ``scp_codes``, ``strat_fold`` and ``filename_lr`` columns are used).
        image_dir: Directory searched recursively for ``.png`` images.
        transform: Optional callable applied to every loaded RGB image.
        test: ``True`` selects the test fold, ``False`` the training folds.
    """

    def __init__(self, scp_statements_df_path, ptb_xl_database_df_path, image_dir, transform=None, test=False):
        self.scp_statements_df_path = scp_statements_df_path
        self.ptb_xl_database_df_path = ptb_xl_database_df_path
        self.image_dir = image_dir
        self.transform = transform
        self.ecg_labels, self.ecg_paths = self.get_ecg_paths_and_labels(test)
        self.image_paths = self._get_image_paths()

    def get_ecg_paths_and_labels(self, test):
        """Build the integer labels and file names of the requested split.

        Args:
            test: ``True`` for the test fold (``strat_fold == 10``), ``False`` for
                all remaining folds.

        Returns:
            tuple: ``(y_labels, y_paths)`` where ``y_labels`` is the array of
            integer-encoded superclasses and ``y_paths`` is a Series of file names
            (last component of ``filename_lr``) indexed ``0..n-1`` in the same order.
        """
        # Load the database file; scp_codes is stored as text and parsed into Python literals
        ptb_xl_database_df = pd.read_csv(self.ptb_xl_database_df_path, index_col='ecg_id')
        ptb_xl_database_df['scp_codes'] = ptb_xl_database_df['scp_codes'].apply(ast.literal_eval)

        # Load scp_statements.csv for diagnostic aggregation
        agg_df = pd.read_csv(self.scp_statements_df_path, index_col=0)
        agg_df = agg_df[agg_df.diagnostic == 1]

        def aggregate_diagnostic(y_dic):
            """Return the unique diagnostic superclasses for one record's SCP codes."""
            return list({agg_df.loc[key].diagnostic_class for key in y_dic.keys() if key in agg_df.index})

        # Apply diagnostic superclass
        ptb_xl_database_df['diagnostic_superclass'] = ptb_xl_database_df.scp_codes.apply(aggregate_diagnostic)

        # Keep only records with exactly one superclass and unwrap the one-element lists
        single_class_df = ptb_xl_database_df[ptb_xl_database_df.diagnostic_superclass.apply(len) == 1]
        superclass = single_class_df.diagnostic_superclass.apply(lambda classes: classes[0])

        # Fit the encoder on ALL single-class records (train and test folds together)
        # so both splits are guaranteed to share the same class -> integer mapping,
        # even if one split happens to lack a class.
        label_encoder = LabelEncoder()
        label_encoder.fit(superclass)

        # Print the mapping of original labels to integers
        label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
        print("Label Mapping: ", label_mapping)

        # Split data into train and test
        test_fold = 10
        in_test_fold = single_class_df.strat_fold == test_fold
        selected = in_test_fold if test else ~in_test_fold

        y_labels = label_encoder.transform(superclass[selected])

        # File names aligned with y_labels; a fresh 0-based index makes positional
        # and label-based access agree.
        y_paths = (
            single_class_df.loc[selected, 'filename_lr']
            .apply(lambda full_path: full_path.split('/')[-1])
            .reset_index(drop=True)
            .rename_axis('index')
        )

        return y_labels, y_paths

    def _get_image_paths(self):
        """Return the paths of all ``.png`` files found recursively under ``self.image_dir``."""
        image_paths = []
        for root, _, files in os.walk(self.image_dir):
            for file in files:
                if file.endswith('.png'):
                    image_paths.append(os.path.join(root, file))
        return image_paths

    def __len__(self):
        """Return the number of records (labels) in the selected split."""
        return len(self.ecg_labels)

    def _find_image_path(self, idx):
        """Return the first collected path containing record ``idx``'s image name, or ``None``."""
        img_name = self.ecg_paths.iloc[idx] + '-0.png'
        return next((path for path in self.image_paths if img_name in path), None)

    def __getitem__(self, idx):
        """Load the image and label for the record at position ``idx``.

        When the record has no image, the first record that does have one is
        returned instead (best-effort substitution), together with that record's
        own label.

        Returns:
            tuple: ``(image, label)`` where ``image`` is the RGB image (passed
            through ``self.transform`` when one is set) and ``label`` is a scalar
            long tensor.

        Raises:
            FileNotFoundError: If no record in the split has a matching image.
        """
        img_path = self._find_image_path(idx)

        if img_path is None:
            missing_name = self.ecg_paths.iloc[idx] + '-0.png'
            # Scan from the first record for a usable substitute. `idx` is rebound
            # to the substitute's position so the label read below belongs to the
            # image that is actually returned.
            for idx in range(len(self.ecg_paths)):
                img_path = self._find_image_path(idx)
                if img_path is not None:
                    break
            else:
                raise FileNotFoundError(
                    f"Image {missing_name} not found and no other record has an image."
                )
            print(f'Image {missing_name} not found; using record {idx} instead\n')

        image = Image.open(img_path).convert('RGB')
        label = self.ecg_labels[idx]

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(label).long()

# Initialize dataset and dataloader for training.
# shuffle=True reshuffles the samples at every epoch; without it the training
# batches would always be produced in the same dataset order.
train_dataset = ECGImageDataset(scp_statements_path, database_path, img_dir, transform=transform, test=False)
train_dataloader = DataLoader(train_dataset, batch_size=200, shuffle=True)
print(len(train_dataset))

# Print the shapes of the examples and labels from the train dataloader
from tqdm import tqdm
for examples, labels in tqdm(train_dataloader):
    print(examples.shape, labels.shape)