In [None]:
from pathlib import Path
import pandas as pd

In [None]:
import tomllib

# Paths are resolved relative to the notebook's working directory.
configfile = Path("config.toml")
datadir = Path('../data')

# tomllib.load() only accepts a binary file handle, hence mode 'rb'.
with open(configfile, mode='rb') as handle:
    config = tomllib.load(handle)

print(config)


# The PTB Diagnostic ECG Database

- Number of Samples: 14552
- Number of Categories: 2
- Sampling Frequency: 125Hz
- Data Source: Physionet's PTB Diagnostic Database

All the samples are cropped, downsampled and padded with zeroes if necessary to the fixed dimension of 187. There is a target column named "target".

Let's see how many rows we have.

In [None]:
# Build the PTB split paths from the file stem stored under the 'ptb' config key.
trainfile = datadir.joinpath(config['ptb'] + '_train.parq')
testfile = datadir.joinpath(config['ptb'] + '_test.parq')

# Load both splits, then show their row counts as (train, test).
train_df = pd.read_parquet(trainfile)
test_df = pd.read_parquet(testfile)
tuple(map(len, (train_df, test_df)))

And let's check how balanced the dataset is.

In [None]:
# Class balance of the PTB train/test splits.
# Only the last expression of a cell is displayed, so the absolute counts are
# bound to names and combined with the percentages into one final table.
train_counts = train_df.target.value_counts()
test_counts = test_df.target.value_counts()
train_percentages = train_df.target.value_counts(normalize=True) * 100
test_percentages = test_df.target.value_counts(normalize=True) * 100

# One row per class label; counts and percentage shares side by side.
pd.concat(
    {
        "train_count": train_counts,
        "train_pct": train_percentages,
        "test_count": test_counts,
        "test_pct": test_percentages,
    },
    axis=1,
)

In [None]:
# Show the column labels of the training frame that is currently loaded.
train_df.columns

The last column is the target column

What does a signal look like?

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Which row to inspect and how far apart the x ticks are; both are arbitrary
# choices made for readability.
sample_idx = 2
tick_step = 25

# Take one row and remove the label by name instead of by position, so the
# plot never silently includes the target if the column order changes.
signal = train_df.iloc[sample_idx].drop("target")

# Explicit figure/axes instead of the implicit pyplot state machine, with a
# title and axis labels so the figure can be read on its own.
fig, ax = plt.subplots()
sns.lineplot(data=signal, ax=ax)
ax.set_xticks(range(0, len(signal), tick_step))
ax.set(
    title=f"PTB training sample (row {sample_idx})",
    xlabel="Time step (column index)",
    ylabel="Signal value",
)
print(signal)


# Arrhythmia Dataset

- Number of Samples: 109446
- Number of Categories: 5
- Sampling Frequency: 125Hz
- Data Source: Physionet's MIT-BIH Arrhythmia Dataset
- Classes: {'N': 0, 'S': 1, 'V': 2, 'F': 3, 'Q': 4}

All the samples are cropped, downsampled and padded with zeroes if necessary to the fixed dimension of 187. There is a target column named "target".

How is the train/test split?

In [None]:
# Build the arrhythmia split paths from the file stem stored under the
# 'arrhythmia' config key.
trainfile = datadir.joinpath(config['arrhythmia'] + '_train.parq')
testfile = datadir.joinpath(config['arrhythmia'] + '_test.parq')

# Rebind train_df / test_df to the arrhythmia splits and show their row
# counts as (train, test).
train_df = pd.read_parquet(trainfile)
test_df = pd.read_parquet(testfile)
tuple(map(len, (train_df, test_df)))

And what about the balance of the target?

In [None]:
# Class balance of the arrhythmia train/test splits.
# Only the last expression of a cell is displayed, so the absolute counts are
# bound to names and combined with the percentages into one final table.
train_counts = train_df.target.value_counts()
test_counts = test_df.target.value_counts()
train_percentages = train_df.target.value_counts(normalize=True) * 100
test_percentages = test_df.target.value_counts(normalize=True) * 100

# One row per class label; counts and percentage shares side by side.
pd.concat(
    {
        "train_count": train_counts,
        "train_pct": train_percentages,
        "test_count": test_counts,
        "test_pct": test_percentages,
    },
    axis=1,
)

What does the signal look like?

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Which row to inspect and how far apart the x ticks are; both are arbitrary
# choices made for readability.
sample_idx = 2
tick_step = 25

# Take one row and remove the label by name instead of by position, so the
# plot never silently includes the target if the column order changes.
signal = train_df.iloc[sample_idx].drop("target")
print(signal)

# Explicit figure/axes instead of the implicit pyplot state machine, with a
# title and axis labels so the figure can be read on its own.
fig, ax = plt.subplots()
sns.lineplot(data=signal, ax=ax)
ax.set_xticks(range(0, len(signal), tick_step))
ax.set(
    title=f"Arrhythmia training sample (row {sample_idx})",
    xlabel="Time step (column index)",
    ylabel="Signal value",
);