In [1]:
import sys
import wfdb
import os
import matplotlib.pyplot as plt
from wfdb import processing 
import numpy as np
import pickle
import logging

sys.path.append('../../analyse')
sys.path.append('../../')

from analyse.utils.download_db import (
    get_signals,
    get_db,
)

from analyse.utils.global_config import GlobalConfig

logging.basicConfig(
    filename='run-logs.log', 
    encoding='utf-8', 
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.DEBUG, 
    filemode='w'
)

# Download data and preprocess it

In [2]:
url = "https://physionet.org/static/published-projects/afdb/mit-bih-atrial-fibrillation-database-1.0.0.zip"
name = "MIT-BIH-AtrialFibrillation"

db_path = get_db(url, name, "../../analyse/data/")

signals = get_signals(db_path, reload=False)

In [None]:
windows = []
classification = []
for sig in signals:
    for window in sig.windows:
        metrics, has_defect = window.get_data()
        windows.append(metrics)
        classification.append(has_defect)
print(len(windows))

2294582


# Machine learning using Random Forest Classifier

In [6]:
from sklearn.model_selection import train_test_split

import pandas as pd


windows_pd = pd.DataFrame(windows)
classification_pd = pd.DataFrame(classification)

X_train, X_test, y_train, y_test = train_test_split(windows_pd, classification_pd, random_state=0)

print(len(X_train) / len(X_test))


2.9999965135292497


In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

rf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=GlobalConfig.get("rf_params"),
    n_jobs=-1,
    scoring='roc_auc',
    verbose=3,
    refit=True
)

In [8]:
rf_model = rf.fit(X_train, y_train.values.ravel())

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV 1/5] END bootstrap=True, max_depth=6, max_features=sqrt, n_estimators=50;, score=0.986 total time= 1.1min
[CV 5/5] END bootstrap=True, max_depth=6, max_features=sqrt, n_estimators=50;, score=0.985 total time= 1.1min
[CV 2/5] END bootstrap=True, max_depth=6, max_features=sqrt, n_estimators=50;, score=0.986 total time= 1.1min
[CV 4/5] END bootstrap=True, max_depth=6, max_features=sqrt, n_estimators=50;, score=0.986 total time= 1.2min
[CV 3/5] END bootstrap=True, max_depth=6, max_features=sqrt, n_estimators=50;, score=0.986 total time= 1.2min
[CV 4/5] END bootstrap=True, max_depth=6, max_features=sqrt, n_estimators=100;, score=0.986 total time= 2.2min
[CV 2/5] END bootstrap=True, max_depth=6, max_features=sqrt, n_estimators=100;, score=0.986 total time= 2.2min
[CV 3/5] END bootstrap=True, max_depth=6, max_features=sqrt, n_estimators=100;, score=0.986 total time= 2.2min
[CV 5/5] END bootstrap=True, max_depth=6, max_features=

In [9]:
mode_file_name = "../../analyse/models/RandomForestClassifier.pickle" 

with open(mode_file_name, 'wb') as bin_file:
    pickle.dump(
        rf_model,
        file=bin_file,
        protocol=pickle.HIGHEST_PROTOCOL
    )

# TESTS

**Check test dataset**

In [10]:
print(rf_model.score(X_test, y_test))

0.9905495956291254


**Create some more datasets for testing and get mean**

In [11]:
results = []

for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(windows_pd, classification_pd, random_state=i)
    results.append(rf_model.score(X_test, y_test))

print(np.mean(results))

0.9907657314916571


**Let's try to create dataset full of ecg siganls with defects**

In [12]:
only_bad_window = []
only_bad_classification = []
for i in range(len(windows)):
    if classification[i] == 1:
        only_bad_window.append(windows[i])
        only_bad_classification.append(1)

only_bad_window = pd.DataFrame(only_bad_window)
only_bad_classification = pd.DataFrame(only_bad_classification)

print(len(only_bad_window))

1071632


In [13]:
print(
    len([i for i in rf_model.predict(only_bad_window) if i == 1]) / len(only_bad_window)
)


0.9608335697328934


**Let's try to create dataset full of ecg siganls without defects**

In [14]:
only_good_window = []
only_good_classification = []
for i in range(len(windows)):
    if classification[i] == 0:
        only_good_window.append(windows[i])
        only_good_classification.append(1)

only_good_window = pd.DataFrame(only_good_window)
only_good_classification = pd.DataFrame(only_good_classification)

print(len(only_good_window))

1222950


In [15]:
print(
    len([i for i in rf_model.predict(only_good_window) if i == 0]) / len(only_good_window)
)

0.9698474998977882


**Check all ecg signals at once**

In [16]:
print(rf_model.score(windows_pd, classification_pd))

0.9907777098435466
