In [7]:
from pathlib import Path
import pandas as pd

# Arrhythmia Dataset

- Number of Samples: 109446
- Number of Categories: 5
- Sampling Frequency: 125Hz
- Data Source: Physionet's MIT-BIH Arrhythmia Dataset
- Classes:
    - 'Normal': 0, 
    - 'Supraventricular ectopic beat': 1, 
    - 'Ventricular ectopic beat': 2, 
    - 'Fusion beat': 3, 
    - 'Unknown beat': 4
- All the samples are cropped, downsampled and padded with zeroes if necessary to the fixed dimension of 187. There is a target column named "target".

How is the train/test split?

In [8]:
datadir = Path('../hackathon-data/')
trainfile = datadir / 'heart_big_train.parq'
validfile = datadir / 'heart_big_valid.parq'
train_df = pd.read_parquet(trainfile)
valid_df = pd.read_parquet(validfile)
len(train_df), len(valid_df)

(87554, 10946)

And what about the balance of the target?

In [9]:
train_df.target.value_counts(), valid_df.target.value_counts()
train_percentages = train_df.target.value_counts(normalize=True) * 100
valid_percentages = valid_df.target.value_counts(normalize=True) * 100
train_percentages,  valid_percentages

(target
 0.0    82.772917
 4.0     7.345181
 2.0     6.610777
 1.0     2.539005
 3.0     0.732120
 Name: proportion, dtype: float64,
 target
 0.0    82.760826
 4.0     7.345149
 2.0     6.614288
 1.0     2.539741
 3.0     0.739996
 Name: proportion, dtype: float64)

In [24]:
test = 100.0 / train_percentages
test.to_numpy()


array([  1.20812463,  13.61436791,  15.1268141 ,  39.38551507,
       136.58970359])

What does the signal look like?

In [31]:
train_df[train_df['target'] == 4.0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,178,179,180,181,182,183,184,185,186,target
81123,0.712610,0.629032,0.527859,0.414956,0.284457,0.164223,0.080645,0.032258,0.000000,0.016129,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
81124,1.000000,0.484848,0.541667,0.526515,0.522727,0.507576,0.496212,0.412879,0.284091,0.140152,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
81125,0.584046,0.500000,0.424501,0.324786,0.233618,0.128205,0.049858,0.000000,0.000000,0.028490,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
81126,1.000000,0.936170,0.878598,0.813517,0.727159,0.607009,0.496871,0.365457,0.272841,0.232791,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
81127,0.680412,0.584192,0.491409,0.379725,0.269759,0.149485,0.072165,0.012027,0.000000,0.012027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87549,0.807018,0.494737,0.536842,0.529825,0.491228,0.484211,0.456140,0.396491,0.284211,0.136842,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
87550,0.718333,0.605000,0.486667,0.361667,0.231667,0.120000,0.051667,0.001667,0.000000,0.013333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
87551,0.906122,0.624490,0.595918,0.575510,0.530612,0.481633,0.444898,0.387755,0.322449,0.191837,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
87552,0.858228,0.645570,0.845570,0.248101,0.167089,0.131646,0.121519,0.121519,0.118987,0.103797,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0


In [25]:
import seaborn as sns
import matplotlib.pyplot as plt

signal = train_df.iloc[ :-1]
print(signal)

sns.lineplot(data=signal)
plt.xticks(range(0, len(signal), 25))

              0         1         2         3         4         5         6  \
0      0.977941  0.926471  0.681373  0.245098  0.154412  0.191176  0.151961   
1      0.960114  0.863248  0.461538  0.196581  0.094017  0.125356  0.099715   
2      1.000000  0.659459  0.186486  0.070270  0.070270  0.059459  0.056757   
3      0.925414  0.665746  0.541436  0.276243  0.196133  0.077348  0.071823   
4      0.967136  1.000000  0.830986  0.586854  0.356808  0.248826  0.145540   
...         ...       ...       ...       ...       ...       ...       ...   
87548  1.000000  0.405594  0.440559  0.405594  0.405594  0.384615  0.374126   
87549  0.807018  0.494737  0.536842  0.529825  0.491228  0.484211  0.456140   
87550  0.718333  0.605000  0.486667  0.361667  0.231667  0.120000  0.051667   
87551  0.906122  0.624490  0.595918  0.575510  0.530612  0.481633  0.444898   
87552  0.858228  0.645570  0.845570  0.248101  0.167089  0.131646  0.121519   

              7         8         9  ...  178  179 

  func(*args, **kwargs)
  fig.canvas.print_figure(bytes_io, **kw)


Error in callback <function flush_figures at 0x12dc41120> (for post_execute), with arguments args (),kwargs {}:


KeyboardInterrupt: 

In [11]:
train_df.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '178', '179', '180', '181', '182', '183', '184', '185', '186',
       'target'],
      dtype='object', length=188)