In [37]:
import sys
import wfdb
import os
import matplotlib.pyplot as plt
from wfdb import processing 
import numpy as np
import pickle
import logging

# Add two directories (relative to the current working directory) to the
# module search path, presumably so the `analyse.*` imports below resolve.
sys.path.append('../../analyse')
sys.path.append('../../')

from analyse.utils.download_db import (
    get_signals,
    get_db,
)

from analyse.utils.global_config import GlobalConfig

# Configure the root logger to write DEBUG-and-above records to 'run-logs.log'.
# filemode='w' opens the log file in write mode rather than appending to it.
logging.basicConfig(
    filename='run-logs.log', 
    encoding='utf-8', 
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.DEBUG, 
    filemode='w'
)
# Instantiate GlobalConfig with the path to the JSON parameter file.
# NOTE(review): a later cell calls GlobalConfig.get("rf_params") on the class
# itself, so this constructor presumably loads the file into class-level
# state -- confirm in analyse/utils/global_config.
GlobalConfig(r'../../analyse/config/params.json')

<analyse.utils.global_config.GlobalConfig at 0x3281b7580>

# Download data and preprocess it

In [2]:
# Zip archive of the MIT-BIH Atrial Fibrillation Database on PhysioNet, and the
# name passed to get_db alongside it.
url = "https://physionet.org/static/published-projects/afdb/mit-bih-atrial-fibrillation-database-1.0.0.zip"
name = "MIT-BIH-AtrialFibrillation"

# get_db receives the archive URL, the name and a data directory; its return
# value is used below as the database path.
# NOTE(review): presumably downloads/extracts the archive on first use -- confirm.
db_path = get_db(url, name, "../../analyse/data/")

# Load the signals found under db_path; the next cell iterates `sig.windows`
# on each of them.
# NOTE(review): reload=False presumably reuses previously processed data -- confirm.
signals = get_signals(db_path, reload=False)

In [3]:
# Flatten every window of every signal into two parallel lists: the metrics
# returned by window.get_data() and the matching has-defect label.
window_data = [
    window.get_data()
    for sig in signals
    for window in sig.windows
]
windows = [metrics for metrics, _has_defect in window_data]
classification = [has_defect for _metrics, has_defect in window_data]

# Total number of windows collected.
print(len(windows))

2294582


# Machine learning using Random Forest Classifier

In [4]:
from sklearn.model_selection import train_test_split

import pandas as pd

# Wrap the collected window metrics and labels in DataFrames for scikit-learn.
windows_pd, classification_pd = pd.DataFrame(windows), pd.DataFrame(classification)

# Seeded split using train_test_split's default train/test proportions.
X_train, X_test, y_train, y_test = train_test_split(
    windows_pd,
    classification_pd,
    random_state=0,
)

# Ratio of training rows to test rows.
train_test_ratio = len(X_train) / len(X_test)
print(train_test_ratio)


2.9999965135292497


In [39]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Grid search over the random-forest hyper-parameters stored under "rf_params"
# in the project configuration, ranked by ROC AUC.
# The base estimator is seeded (same seed as the train/test split) so that
# repeated runs of the search build identical forests and give comparable
# scores; an unseeded forest makes every run non-reproducible.
rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_grid=GlobalConfig.get("rf_params"),
    n_jobs=-1,         # use all available cores for the candidate fits
    scoring='roc_auc',
    verbose=3,         # print one line per finished fold
    refit=True         # refit the best candidate on the full training data
)

In [40]:
# The labels are held in a DataFrame; flatten them to a 1-D array before
# handing them to the grid search, then run the search on the training split.
train_labels = y_train.values.ravel()
rf_model = rf.fit(X_train, train_labels)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV 1/5] END bootstrap=True, max_depth=6, max_features=auto, n_estimators=50;, score=0.986 total time= 2.2min
[CV 5/5] END bootstrap=True, max_depth=6, max_features=auto, n_estimators=50;, score=0.986 total time= 2.3min
[CV 4/5] END bootstrap=True, max_depth=6, max_features=auto, n_estimators=50;, score=0.986 total time= 2.3min
[CV 3/5] END bootstrap=True, max_depth=6, max_features=auto, n_estimators=50;, score=0.986 total time= 2.3min
[CV 2/5] END bootstrap=True, max_depth=6, max_features=auto, n_estimators=50;, score=0.986 total time= 2.4min
[CV 1/5] END bootstrap=True, max_depth=6, max_features=auto, n_estimators=100;, score=0.986 total time= 4.4min
[CV 2/5] END bootstrap=True, max_depth=6, max_features=auto, n_estimators=100;, score=0.986 total time= 4.4min
[CV 3/5] END bootstrap=True, max_depth=6, max_features=auto, n_estimators=100;, score=0.986 total time= 4.5min
[CV 1/5] END bootstrap=True, max_depth=6, max_features=

KeyboardInterrupt: 

In [8]:
# Persist the fitted model to disk using the newest available pickle protocol.
mode_file_name = "../../analyse/models/RandomForestClassifier.pickle"

with open(mode_file_name, mode='wb') as bin_file:
    pickle.dump(rf_model, bin_file, pickle.HIGHEST_PROTOCOL)

# TESTS

**Check test dataset**

In [9]:
# Score the fitted model on the X_test / y_test split and show the result.
test_score = rf_model.score(X_test, y_test)
print(test_score)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:    0.7s


0.9610491487781663


[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    1.2s finished


**Re-split the data with 100 different seeds, score each test split, and report the mean**

In [10]:
# Score the already-fitted model on the test portion of 100 differently seeded
# splits and report the mean.
#
# The re-splits are bound to throwaway names on purpose: rebinding X_train /
# X_test / y_train / y_test here would silently change what the fit and
# test-score cells operate on if they are (re-)run afterwards.
#
# NOTE(review): the model was fitted on the seed-0 training split of the same
# data, so for other seeds the test portion contains rows the model has already
# seen -- the mean is not a pure held-out estimate.
results = []

for seed in range(100):
    # train_test_split returns [X_train, X_test, y_train, y_test]; only the
    # test parts (indices 1 and 3) are needed for scoring.
    resplit = train_test_split(windows_pd, classification_pd, random_state=seed)
    X_resplit_test, y_resplit_test = resplit[1], resplit[3]
    results.append(rf_model.score(X_resplit_test, y_resplit_test))

print(np.mean(results))

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    1.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    1.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    1.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 112 tasks      |

0.9612096833238617


[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    1.2s finished


**Let's try to create a dataset consisting only of ECG signals with defects**

In [11]:
# Keep only the windows whose label equals 1 (defect present), together with a
# matching all-ones label frame.
bad_rows = [
    metrics
    for metrics, label in zip(windows, classification)
    if label == 1
]

only_bad_window = pd.DataFrame(bad_rows)
only_bad_classification = pd.DataFrame([1] * len(bad_rows))

# Number of defect windows.
print(len(only_bad_window))

1071632


In [12]:
# Fraction of defect-only windows that the model predicts as class 1.
bad_predictions = rf_model.predict(only_bad_window)
print(np.count_nonzero(bad_predictions == 1) / len(only_bad_window))


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    1.4s finished


0.9574863385938457


**Let's try to create a dataset consisting only of ECG signals without defects**

In [13]:
# Keep only the windows whose label equals 0 (no defect), together with a
# matching label frame.
good_rows = [
    metrics
    for metrics, label in zip(windows, classification)
    if label == 0
]

only_good_window = pd.DataFrame(good_rows)
# These windows were selected because their label is 0, so the label frame must
# hold 0 as well (it was previously filled with 1, copied from the defect cell).
only_good_classification = pd.DataFrame([0] * len(good_rows))

# Number of defect-free windows.
print(len(only_good_window))

1222950


In [14]:
# Fraction of defect-free windows that the model predicts as class 0.
good_predictions = rf_model.predict(only_good_window)
print(np.count_nonzero(good_predictions == 0) / len(only_good_window))

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    1.6s finished


0.9644670673371765


**Check all ECG signals at once**

In [15]:
# Score the fitted model on every window (training and test rows together).
full_dataset_score = rf_model.score(windows_pd, classification_pd)
print(full_dataset_score)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:    1.8s


0.9612068777668438


[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    3.0s finished
