# ECG Data Exploration

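Initial exploration of the preprocessed, per-heartbeat training split derived from the MIT-BIH Arrhythmia Database: class balance, one example waveform per class, and overall amplitude statistics.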

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Load the preprocessed training split.
# `records` holds one heartbeat per row; `labels` holds the integer class id
# of the heartbeat in the matching row.
records = np.load('../data/train_records.npy')
labels = np.load('../data/train_labels.npy')

# Fail fast on malformed inputs. Every later cell indexes `records` with
# positions found in `labels`, so a rank or length mismatch would otherwise
# surface as an opaque IndexError or as silently wrong plots and counts.
if records.ndim != 2:
    raise ValueError(
        f'records must be 2-D (heartbeats x samples), got shape {records.shape}'
    )
if labels.shape != (records.shape[0],):
    raise ValueError(
        f'labels must be 1-D with one entry per heartbeat: '
        f'records has shape {records.shape}, labels has shape {labels.shape}'
    )

print(f'Records loaded: {len(records)} heartbeats')
print(f'Signal length: {records.shape[1]} samples')

Records loaded: 87554 heartbeats
Signal length: 360 samples


In [5]:
# Number of heartbeats per class, reported by human-readable class name.
# `class_names` maps the integer ids stored in `labels` to display names.
class_names = {
    0: 'Normal',
    1: 'Supraventricular',
    2: 'Ventricular',
    3: 'Fusion',
    4: 'Unknown',
}
# One row per heartbeat: the raw integer label plus its mapped display name.
df = pd.DataFrame({'label': labels}).assign(
    class_name=lambda frame: frame['label'].map(class_names)
)
print(df['class_name'].value_counts())

Normal              72471
Unknown              6431
Ventricular          5788
Supraventricular     2223
Fusion                641
Name: class_name, dtype: int64


## Signal Visualization

Plot one example heartbeat from each class (the first occurrence of each label in the dataset).

In [3]:
# Plot one example heartbeat per class: the first row of `records` whose
# label equals each class id in `class_names`.
# The grid is sized from `class_names` and axes are paired with classes by
# position, so the cell does not depend on there being exactly five classes
# or on class ids being contiguous 0..N-1.
n_classes = len(class_names)
fig, axes = plt.subplots(
    n_classes, 1, figsize=(12, 2 * n_classes), squeeze=False
)
# squeeze=False always returns a 2-D grid (even for a single row); keep the
# only column so `axes` stays a 1-D array of Axes.
axes = axes[:, 0]
for ax, (cls_id, cls_name) in zip(axes, class_names.items()):
    ax.set_ylabel('Amplitude')
    matches = np.flatnonzero(labels == cls_id)
    if matches.size == 0:
        # A class with no heartbeats in this split would otherwise raise
        # IndexError when taking the first match; label the empty panel.
        ax.set_title(f'{cls_name} (no examples in dataset)')
        continue
    idx = matches[0]
    ax.plot(records[idx])
    ax.set_title(cls_name)
# Only the bottom panel carries the x-axis label.
axes[-1].set_xlabel('Sample')
plt.tight_layout()
plt.show()

<Figure size 1200x1000 with 5 Axes>

In [4]:
# Share of heartbeats in each class, as a percentage of all rows in `df`.
label_counts = df['class_name'].value_counts()
label_pcts = label_counts.div(len(df)).mul(100)

# Build the whole report first, then emit it with a single print call.
report_lines = ['Class distribution (%):']
report_lines.extend(f'  {name}: {pct:.1f}%' for name, pct in label_pcts.items())
print('\n'.join(report_lines))

Class distribution (%):
  Normal: 82.8%
  Unknown: 7.3%
  Ventricular: 6.6%
  Supraventricular: 2.5%
  Fusion: 0.7%


In [8]:
# Amplitude statistics pooled over every sample of every heartbeat, followed
# by the array shapes and the distinct label values present.
summary_lines = [
    f'Mean amplitude: {records.mean():.4f}',
    f'Std amplitude: {records.std():.4f}',
    f'Min amplitude: {records.min():.4f}',
    f'Max amplitude: {records.max():.4f}',
    # The leading newline leaves a blank line between the two groups.
    f'\nDataset shape: {records.shape}',
    f'Labels shape: {labels.shape}',
    f'Unique labels: {np.unique(labels)}',
]
print('\n'.join(summary_lines))

Mean amplitude: -0.0032
Std amplitude: 0.2145
Min amplitude: -3.4521
Max amplitude: 4.1203

Dataset shape: (87554, 360)
Labels shape: (87554,)
Unique labels: [0 1 2 3 4]
