## **Install Dependencies**

In [1]:
pip install wfdb wget tqdm biosppy imbalanced-learn

Collecting wfdb
  Downloading wfdb-3.4.1-py3-none-any.whl (137 kB)
[?25l[K     |██▍                             | 10 kB 38.1 MB/s eta 0:00:01[K     |████▊                           | 20 kB 39.3 MB/s eta 0:00:01[K     |███████▏                        | 30 kB 20.1 MB/s eta 0:00:01[K     |█████████▌                      | 40 kB 17.8 MB/s eta 0:00:01[K     |████████████                    | 51 kB 17.9 MB/s eta 0:00:01[K     |██████████████▎                 | 61 kB 15.8 MB/s eta 0:00:01[K     |████████████████▋               | 71 kB 14.1 MB/s eta 0:00:01[K     |███████████████████             | 81 kB 15.5 MB/s eta 0:00:01[K     |█████████████████████▍          | 92 kB 15.8 MB/s eta 0:00:01[K     |███████████████████████▉        | 102 kB 15.8 MB/s eta 0:00:01[K     |██████████████████████████▏     | 112 kB 15.8 MB/s eta 0:00:01[K     |████████████████████████████▋   | 122 kB 15.8 MB/s eta 0:00:01[K     |███████████████████████████████ | 133 kB 15.8 MB/s eta 0:00:01

In [2]:
# Replace the preinstalled matplotlib with a pinned 3.1.3 build.
# NOTE(review): the recorded pip output for this cell reports that wfdb 3.4.1 requires
# matplotlib>=3.3.4, so this pin conflicts with the wfdb install; the reason for the
# downgrade is not recorded in the notebook - confirm it is still needed.
!pip uninstall matplotlib --yes
!pip install matplotlib==3.1.3 

Found existing installation: matplotlib 3.5.1
Uninstalling matplotlib-3.5.1:
  Successfully uninstalled matplotlib-3.5.1
Collecting matplotlib==3.1.3
  Downloading matplotlib-3.1.3-cp37-cp37m-manylinux1_x86_64.whl (13.1 MB)
[K     |████████████████████████████████| 13.1 MB 13.9 MB/s 
Installing collected packages: matplotlib
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
wfdb 3.4.1 requires matplotlib>=3.3.4, but you have matplotlib 3.1.3 which is incompatible.
albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.[0m
Successfully installed matplotlib-3.1.3


## **Importing Libraries**

In [3]:
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from glob import glob
import wget
import zipfile
import wfdb as wf
import os
from scipy import signal
from biosppy.signals import ecg
import cv2
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

## **Downloading and Extracting Zip Files**

In [4]:
# Download the MIT-BIH ST Change Database archive and unpack it next to the notebook.
url = 'https://physionet.org/static/published-projects/stdb/mit-bih-st-change-database-1.0.0.zip'
filename = wget.download(url)
# The context manager closes the archive handle as soon as extraction finishes, and the
# local name `archive` avoids rebinding the built-in zip() for the rest of the notebook.
with zipfile.ZipFile(filename) as archive:
    archive.extractall()

In [5]:
# Download the MIT-BIH Arrhythmia Database archive and unpack it next to the notebook.
url = 'https://physionet.org/static/published-projects/mitdb/mit-bih-arrhythmia-database-1.0.0.zip'
filename = wget.download(url)
# The context manager closes the archive handle as soon as extraction finishes, and the
# local name `archive` avoids rebinding the built-in zip() for the rest of the notebook.
with zipfile.ZipFile(filename) as archive:
    archive.extractall()

In [6]:
# Give the extracted folders short names that the `./mitdb_*` glob used later can match.
for extracted_dir, short_dir in (
    ('mit-bih-st-change-database-1.0.0', 'mitdb_1'),
    ('mit-bih-arrhythmia-database-1.0.0', 'mitdb_2'),
):
    # A previous run already renamed this folder: skip it so re-running the cell
    # does not raise FileNotFoundError.
    if os.path.isdir(short_dir) and not os.path.exists(extracted_dir):
        continue
    os.rename(extracted_dir, short_dir)

## **Data Preprocessing**

In [12]:
# Annotation symbols that count as heart beats. The preprocessing loop labels 'N' as
# normal and every other symbol in this list as abnormal; annotations whose symbol is
# not listed here are ignored.
all_heart_beats_symbols = [
    'N', 'L', 'R', 'B', 'A',
    'a', 'J', 'S', 'V', 'r',
    'F', 'e', 'j', 'n', 'E',
    '/', 'f', 'Q', '?',
]

In [8]:
# Delete the extra annotation file before the `*.atr` glob runs, so that '102-0' is not
# picked up as a record name.
# NOTE(review): presumably no signal files exist for '102-0' - confirm against the dataset.
!rm ./mitdb_2/102-0.atr

In [9]:
# Find every annotation file in both database folders, drop the 4-character '.atr'
# suffix to obtain the record path used by the loading loop, and keep the paths sorted.
atr_records = sorted(path[:-4] for path in glob('./mitdb_*/*.atr'))
print('Total files: ', len(atr_records))

Total files:  76


In [14]:
# Build the beat dataset `full_frames`: one row per heart beat holding 300 amplitude
# samples (min-max normalised, zero-padded) followed by a label in the last column
# (0.0 = normal, 1.0 = abnormal). Beats are collected from every channel of every
# record listed in `atr_records`.

LOWPASS_CUTOFF_HZ = 150   # cut-off frequency of the low-pass filter
LABEL_HALF_WINDOW = 10    # samples searched on each side of an R-peak for an annotation
TAIL_SAMPLES = 40         # samples appended from the segment that follows the R-peak
MAX_BEAT_SAMPLES = 280    # beats longer than this are discarded
BEAT_SAMPLES = 300        # every kept beat is zero-padded to this many samples

full_frames = np.array([])
for full_loc in atr_records:
    print('Loading file:', full_loc)

    signal_data = wf.rdsamp(full_loc)            # (samples, header) pair
    annot = wf.rdann(full_loc, 'atr')
    signal_data_t = signal_data[0].transpose()   # one row per channel
    # Sampling frequency from the record header. Kept as a notebook-level name because
    # the spectrogram cell reads `fs` after this loop has finished.
    fs = signal_data[1].get('fs')

    # Class code per annotation: 1.0 for 'N' (normal), 2.0 for any other beat symbol,
    # 0.0 for annotations that are not in the beat-symbol list.
    beat_rate = np.array(
        [1.0 if symbol == 'N'
         else 2.0 if symbol in all_heart_beats_symbols
         else 0.0
         for symbol in annot.symbol],
        dtype=float,
    )
    # Scatter the codes onto the sample axis so they can be looked up by sample index.
    beat_rates = np.zeros_like(signal_data_t[0], dtype='float')
    beat_rates[annot.sample] = beat_rate

    # The filter depends only on fs, so it is designed once per record rather than
    # once per channel.
    w = LOWPASS_CUTOFF_HZ / (fs / 2)   # normalise the cut-off by the Nyquist frequency
    b, a = signal.butter(5, w, 'low')

    for chan_idx, chan_signal in enumerate(signal_data_t):
        chan_signal = signal.filtfilt(b, a, chan_signal)

        print(' --> ECG channel type name:', signal_data[1].get('sig_name')[chan_idx])

        # R-peak detection; the header's sampling rate is passed instead of a literal
        # so the detector and the filter above always agree.
        output = ecg.ecg(signal=chan_signal, sampling_rate=fs, show=False)
        rpeak_positions = output['rpeaks']

        # segments[i] runs from R-peak i-1 up to (not including) R-peak i; the final
        # segment (after the last R-peak) is only ever used as a tail donor.
        segments = np.split(chan_signal, rpeak_positions)

        # Kept beats are gathered in a plain list. Deleting rejected entries from the
        # ragged list of segments with np.delete relied on implicit object-array
        # creation, which newer NumPy versions refuse.
        kept_beats = []
        for idx, idxval in enumerate(rpeak_positions):
            if idx == 0:
                continue   # the first segment has no preceding R-peak

            # Strongest class code found near the R-peak (abnormal wins over normal).
            fromidx = max(idxval - LABEL_HALF_WINDOW, 0)
            toidx = idxval + LABEL_HALF_WINDOW
            annot_type_value = beat_rates[fromidx:toidx].max()
            if annot_type_value == 0.0:
                continue   # no beat annotation close to this R-peak

            # R-R interval ending at this peak plus a short tail after the peak.
            beat = np.append(segments[idx], segments[idx + 1][:TAIL_SAMPLES])
            if beat.size > MAX_BEAT_SAMPLES:
                continue

            amplitude_range = np.ptp(beat)
            if amplitude_range == 0:
                continue   # flat segment: min-max scaling would produce NaN rows

            beat = (beat - beat.min()) / amplitude_range
            beat = np.pad(beat, (0, BEAT_SAMPLES - beat.size), 'constant',
                          constant_values=(0.0, 0.0))
            # Shift the class code so the stored label is 0.0 (normal) / 1.0 (abnormal).
            kept_beats.append(np.append(beat, annot_type_value - 1.0))

        # Builtin float replaces the removed np.float alias.
        final_data = np.array(kept_beats, dtype=float)

        if full_frames.size == 0:
            full_frames = final_data
        elif final_data.size != 0:
            full_frames = np.concatenate((full_frames, final_data), axis=0)
        print("New data shape", final_data.shape, "Total Shape: ", full_frames.shape)

Loading file: ./mitdb_1/300
 --> ECG channel type name: ECG


  return array(a, dtype, copy=False, order=order)


New data shape (2534, 301) Total Shape:  (2534, 301)
 --> ECG channel type name: ECG
New data shape (3, 301) Total Shape:  (2537, 301)
Loading file: ./mitdb_1/301
 --> ECG channel type name: ECG
New data shape (862, 301) Total Shape:  (3399, 301)
 --> ECG channel type name: ECG
New data shape (863, 301) Total Shape:  (4262, 301)
Loading file: ./mitdb_1/302
 --> ECG channel type name: ECG
New data shape (1108, 301) Total Shape:  (5370, 301)
 --> ECG channel type name: ECG
New data shape (0,) Total Shape:  (5370, 301)
Loading file: ./mitdb_1/303
 --> ECG channel type name: ECG
New data shape (1357, 301) Total Shape:  (6727, 301)
 --> ECG channel type name: ECG
New data shape (1367, 301) Total Shape:  (8094, 301)
Loading file: ./mitdb_1/304
 --> ECG channel type name: ECG
New data shape (0,) Total Shape:  (8094, 301)
 --> ECG channel type name: ECG
New data shape (35, 301) Total Shape:  (8129, 301)
Loading file: ./mitdb_1/305
 --> ECG channel type name: ECG
New data shape (172, 301) Total

## **Applying SMOTE**

In [15]:
# Split the beat matrix into features (every column but the last) and labels (last column).
df_final_data_X = pd.DataFrame(data=full_frames[:, :-1])
df_final_data_Y = pd.DataFrame(data=full_frames[:, -1])

# Oversample the minority class with SMOTE. The fixed seed makes the synthetic samples
# (and therefore every spectrogram generated from them) reproducible between runs.
# NOTE(review): resampling happens before any train/test split visible in this part of
# the notebook; if the split is made later on X_sm / y_sm, synthetic points derived from
# test samples can leak into training - confirm.
SMOTE_SEED = 42
smote = SMOTE(sampling_strategy='minority', random_state=SMOTE_SEED)
X_sm, y_sm = smote.fit_resample(df_final_data_X, df_final_data_Y)
y_sm.value_counts()

1.0    76871
0.0    76871
dtype: int64

## **Spectrogram Generation**

In [17]:
!rm -fr data_ecg
os.makedirs('./data_ecg/Spectrogram/Abnormal');
os.makedirs('./data_ecg/Spectrogram/Normal');

In [18]:
# Render one spectrogram PNG per resampled beat into the folder matching its label.
#
# To resume after an interrupted run (e.g. the kernel ran out of RAM), set
# RESUME_AFTER_INDEX to the last index that was printed; rows up to and including that
# index are skipped. The default of -1 processes every row.
RESUME_AFTER_INDEX = -1

beat_labels = y_sm[0]   # single label column: 1.0 = abnormal, anything else = normal
for index, row in X_sm.iterrows():
    if index <= RESUME_AFTER_INDEX:
        continue

    fig = plt.figure(frameon=False)
    ax = fig.add_subplot(1, 1, 1)
    # `fs` is the sampling frequency left behind by the preprocessing loop.
    # NOTE(review): with matplotlib's default NFFT=256 / noverlap=128, a 300-sample row
    # appears to produce a single time slice - confirm this is the intended image.
    ax.specgram(row, Fs=fs)

    # Strip ticks and frame lines so only the spectrogram itself is saved.
    ax.set_xticks([])
    ax.set_yticks([])
    for spine in ax.spines.values():
        spine.set_visible(False)

    if beat_labels.loc[index] == 1.0:
        filename = './data_ecg/Spectrogram/Abnormal/' + str(index) + '.png'
    else:
        filename = './data_ecg/Spectrogram/Normal/' + str(index) + '.png'
    fig.savefig(filename)
    # Close this specific figure so memory does not grow across the loop.
    plt.close(fig)
    print(index)